# ----------------------------------------------------------------------
# Comparing etymological distributions
# ----------------------------------------------------------------------

# Step 1. Normalize the etymology counts by token-row length
def normalize_etymologies(row):
    """Return the row's etymology counts divided by its token count.

    Args:
        row: A mapping-like record (here, a DataFrame row) with a sized
            'tokens' entry and an 'etymology_counts' dict of root -> count.

    Returns:
        dict mapping each root in 'etymology_counts' to count / len(tokens).
        When the row has no tokens, every root maps to 0.0 rather than
        raising ZeroDivisionError.
    """
    total_tokens = len(row['tokens'])
    counts = row['etymology_counts']
    if total_tokens == 0:
        # Nothing to normalize against; report zero share for every root.
        return {k: 0.0 for k in counts}
    return {k: v / total_tokens for k, v in counts.items()}


odysseys_df['normalized_etymology'] = odysseys_df.apply(normalize_etymologies, axis=1)

# Step 2. Aggregate the normalized etymologies by translator
translator_agg = defaultdict(lambda: defaultdict(float))
for _, row in odysseys_df.iterrows():
    translator = row['translator']
    for k, v in row['normalized_etymology'].items():
        translator_agg[translator][k] += v

# Roots a translator never used come out as NaN from the constructor;
# treat them as 0 before transposing.
agg_df = pd.DataFrame(translator_agg).fillna(0).T  # shape: (n_translators, n_roots)

# Step 3. Keep only the roots of interest
# NOTE(review): these look like language codes (Old English, Latin,
# Ancient Greek) -- confirm against the etymology source.
top_roots = ['ang', 'lat', 'grc']
# reindex (rather than agg_df[top_roots]) so that a root absent from the
# whole corpus becomes a zero column instead of raising KeyError.
filtered_df = agg_df.reindex(columns=top_roots, fill_value=0.0)

# Step 4. Normalize so rows sum to 1 (per translator)
# A translator with none of the selected roots has a zero row sum and ends
# up with NaN proportions here.
prop_df = filtered_df.div(filtered_df.sum(axis=1), axis=0)

# Step 5. Plot the results
prop_df.plot(kind='bar', stacked=True)
plt.title("Etymology Root Proportions by Translator")
plt.ylabel("Proportion")
plt.xlabel("Translator")
# Anchor the legend outside the axes (to the right of the plot area).
plt.legend(title="Etymology Root", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()