import autotime # Provision for anxious people
%load_ext autotime

time: 66.4 μs (started: 2025-04-22 02:19:37 +02:00)

# ----------------------------------------------------------------------
# Basic Libraries
# ----------------------------------------------------------------------

import sys 
import os

import ast
from collections import Counter
from collections import defaultdict
from itertools import combinations

import re
import nltk

import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels.api as sm
import scipy.stats as stats
from scipy.stats import shapiro, kstest, normaltest
from scipy.spatial.distance import cosine

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.cluster.hierarchy as sch

time: 2.02 s (started: 2025-04-22 02:19:37 +02:00)

# ----------------------------------------------------------------------
# Personalized Visualization & Functions
# ---------------------------------------------------------------------- 

sys.path.append('/Users/debr/English-Homer/functions') 
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap

import e_chroma as chroma # My Vizualization library
import e_plots as oz      # My custom plots library
import e_pandisplay as pan# My pandas display options

import e_nlp_ody as e     # Import my nlp functions

import warnings           # Nononsense provision
warnings.filterwarnings('ignore')

* Got some chroma in your soma, Oma!
	 »----> use chroma.save_figure(fig, 'my_plot')
Default output path: ./Homer_xplots/

* OZ is behind the curtain!
	 »----> use oz.<func>
	 »----> also, oz goes chroma (for styling)!

*Has Pan taken over?
✓ Pandas display set to e_pandisplay defaults!
	 »----> use pan.<func>

* The editor is in the house!
	 »----> use e.<func> e.g. nlp = e.NLPPipeline(language='english')

Stopwords customized:
  Added: {'being', 'seven', 'six', 'of', 'said', 'them', 'this', 'four', 'five', 'he', 'n', 'are', 'three', 'is', "'and", 'it', 'nine', 'upon', "'", 'eight', 'she', 'they', 'two', 'ten', 'was', 'one', 'mr', 'there', 'be', 'been', 'that', 'mrs', 'were'}
  Removed: {''}
  Total stopwords: 215
Punctuation customized:
  Keeping: {'', '-'}
  Additional removals: {'—', "'", '…', '-', '\\', ',\n        "\'",\n        '}
  Punctuation to be removed: !"#$%&'()*+,,
        "'",
        -./:;<=>?@[\]^_`{|}~—…
time: 644 ms (started: 2025-04-22 02:19:39 +02:00)

[nltk_data] Downloading package punkt_tab to /Users/debr/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!

# ----------------------------------------------------------------------
# File management
# ----------------------------------------------------------------------

# TO UPDATE
nb_id = "lexical_B01"

# Base directory for everything this notebook writes.
output_path = "./"
os.makedirs(output_path, exist_ok=True)

# Plot directory: <output_path>/<nb_id>_plots/.
# os.path.join avoids the doubled separators ("././/...") that plain string
# concatenation produced. The trailing "" keeps a final path separator,
# which is needed because file names are appended directly with
# f"{output_path_plots}<name>.png".
output_path_plots = os.path.join(output_path, f"{nb_id}_plots", "")
os.makedirs(output_path_plots, exist_ok=True)
chroma.set_output_path(output_path_plots)

Output path set to: ././/lexical_B01_plots/
time: 659 μs (started: 2025-04-22 02:19:40 +02:00)

# ----------------------------------------------------------------------
# Odysseys
# ----------------------------------------------------------------------

translators = ['AT_Murray', 'Fitzgerald', 'Lattimore', 'Fagles', 'Wilson', 'Green', 'Woolf']

# One CSV per translator, stacked into a single frame with a fresh index.
dfs = [
    pd.read_csv(f"/Users/debr/odysseys_en/dataframed/Odyssey_{odyssey}_DataFrame.csv")
    for odyssey in translators
]
df = pd.concat(dfs, axis=0, ignore_index=True)

# "text" and "tokens" are parsed from their literal string form back into
# Python objects.
df["text"] = df["text"].map(ast.literal_eval)
df["tokens"] = df["tokens"].map(ast.literal_eval)

# Categorical translator / book columns, then keep only the working columns.
df = df.assign(
    translator=pd.Categorical(df['author']),
    book_num=pd.Categorical(df["book_num"]),
)[['translator', 'book_num', 'text', 'tokens', 'num_words', 'num_tokens']]

# ----------------------------------------------------------------------
# Backup dataframe: only the 'translator', 'book_num', 'text', 'tokens' columns
# ----------------------------------------------------------------------
df_bkp = df[['translator', 'book_num', 'text', 'tokens']].copy()

# ----------------------------------------------------------------------
# Dataframe check
# ----------------------------------------------------------------------
e.check_df(df)

Mr righteous here has no missing values!

* df columns: Index(['translator', 'book_num', 'text', 'tokens', 'num_words', 'num_tokens'], dtype='object') 

* Shape: (168, 6) 

* Total memory in MB: 4.064749
time: 698 ms (started: 2025-04-22 02:19:40 +02:00)

# ----------------------------------------------------------------------
# Create a color mapping for translators
# ----------------------------------------------------------------------

translators = df["translator"].unique()

# Pair each translator with the next colour of the chroma palette.
palette = dict(zip(translators, chroma.color_palette.values()))

plt.figure(figsize=(6, 1))

# One dot per translator, laid out left to right at y = 0.
for i, color in enumerate(palette.values()):
    plt.scatter(i, 0, color=color, s=300)

plt.xticks(range(len(palette)), palette.keys(), fontsize=8)
plt.yticks([])
plt.title("Color Mapping Translators", fontsize=11)
plt.savefig(f"{output_path_plots}translator_color_mapping-{nb_id}.png")
plt.show()

time: 153 ms (started: 2025-04-22 02:19:40 +02:00)

# ----------------------------------------------------------------------
# Probing ety
# ----------------------------------------------------------------------
import ety

# Query the ety package for "muse": its direct origins and its full tree.
word = "muse"
ety_origin, ety_tree = ety.origins(word), ety.tree(word)
print(
    f"Etymology of '{word}': {ety_origin}",
    f"Etymology tree of '{word}': {ety_tree}",
    sep="\n",
)

Etymology of 'muse': [Word(muse, Middle French (ca. 1400-1600) [frm])]
Etymology tree of 'muse': muse (English)
└── muse (Middle French (ca. 1400-1600))
    └── Musa (Latin)
        └── Μοῦσα (Ancient Greek (to 1453))
time: 528 ms (started: 2025-04-22 02:19:40 +02:00)

# Same ety lookup for the word "table": direct origins and full tree.

word = "table"
ety_origin, ety_tree = ety.origins(word), ety.tree(word)
print(
    f"Etymology of '{word}': {ety_origin}",
    f"Etymology tree of '{word}': {ety_tree}",
    sep="\n",
)

Etymology of 'table': [Word(table, Middle English (1100-1500) [enm])]
Etymology tree of 'table': table (English)
└── table (Middle English (1100-1500))
time: 517 μs (started: 2025-04-22 02:19:41 +02:00)

# ----------------------------------------------------------------------
# Getting the full dictionary
# ----------------------------------------------------------------------

etymologypath = "/Users/debr/odysseys_en/etymwn-20130208/etymwn.tsv"

# Tab-separated file; the three column names are supplied explicitly.
etymology_df = pd.read_csv(
    etymologypath,
    sep="\t",
    names=["word", "relation", "target_word"],
    encoding="utf-8",
)
# Only a few distinct relation labels exist, so store them as a category
# to save space.
etymology_df["relation"] = etymology_df["relation"].astype("category")

e.check_df(etymology_df)

Mr righteous here has no missing values!

* df columns: Index(['word', 'relation', 'target_word'], dtype='object') 

* Shape: (6031431, 3) 

* Total memory in MB: 922.924706
time: 4.72 s (started: 2025-04-22 02:19:41 +02:00)

# Peek at 10 randomly sampled rows (fixed seed, so the same rows are drawn on every run).
etymology_df.sample(10, random_state=402)

time: 133 ms (started: 2025-04-22 02:19:46 +02:00)

# Distinct relation labels present in the dictionary.
# NOTE(review): the surrounding [...] builds a one-element list holding the
# array returned by unique(), not a flat list of labels — confirm this is intended.
relations = [etymology_df['relation'].unique()]
relations

[['rel:etymological_origin_of', 'rel:has_derived_form', 'rel:is_derived_from', 'rel:etymology', 'rel:etymologically_related', 'rel:variant:orthography', 'rel:derived', 'rel:etymologically']
 Categories (8, object): ['rel:derived', 'rel:etymological_origin_of', 'rel:etymologically', 'rel:etymologically_related', 'rel:etymology', 'rel:has_derived_form', 'rel:is_derived_from', 'rel:variant:orthography']]

time: 57.4 ms (started: 2025-04-22 02:19:46 +02:00)

target_word = "enm: table"  # Change this to any word you want to match

# All dictionary rows whose source word equals the target.
etymology_root_1 = etymology_df.loc[etymology_df["word"] == target_word]

print(etymology_root_1)

        word        relation                    target_word
1345511  enm: table  rel:etymological_origin_of  eng: table
time: 180 ms (started: 2025-04-22 02:19:46 +02:00)

# ----------------------------------------------------------------------
# A subset dictionary based on "relation"
# ----------------------------------------------------------------------

# Keep only the rows whose relation is "rel:etymology".
relation_etymology_df = etymology_df.loc[
    lambda frame: frame["relation"] == "rel:etymology"
]
relation_etymology_df.sample(8, random_state=42)

time: 37 ms (started: 2025-04-22 02:19:46 +02:00)

# ----------------------------------------------------------------------
# Chasing the etymology tree
# ----------------------------------------------------------------------

# Follow "ghost" by hand, one lookup per step: each step searches for the
# previous step's target word.
target_word = "eng: ghost"
etymology_root_1 = relation_etymology_df.loc[relation_etymology_df["word"] == target_word]
print("Root 1:", etymology_root_1)

target_word = "enm: gost"
etymology_root_2 = relation_etymology_df.loc[relation_etymology_df["word"] == target_word]
print("\nRoot 2:", etymology_root_2)

target_word = "ang: gast"  # last branch registered
etymology_root_3 = relation_etymology_df.loc[relation_etymology_df["word"] == target_word]
print("\nRoot 3:", etymology_root_3)

Root 1:        word        relation       target_word
761941  eng: ghost  rel:etymology  enm: gost 

Root 2:         word       relation       target_word
1342198  enm: gost  rel:etymology  ang: gast 

Root 3: Empty DataFrame
Columns: [word, relation, target_word]
Index: []
time: 228 ms (started: 2025-04-22 02:19:46 +02:00)

# ----------------------------------------------------------------------
# Function to get the etymology tree
# ----------------------------------------------------------------------

def trace_etymology(word, df):
    """Print the etymological lineage of ``word`` as recorded in ``df``.

    Starting from ``word``, the row whose ``"word"`` column equals the
    current word is looked up and its ``"target_word"`` becomes the next
    word, until a word has no row of its own. When several rows match, the
    first one is followed.

    The walk also stops as soon as it would revisit a word already in the
    chain, so cyclic entries in the table cannot cause an endless loop.

    Args:
        word: Word to start from, written the way the table writes it
            (e.g. ``"eng: ghost"``).
        df: DataFrame with ``"word"`` and ``"target_word"`` columns.

    Returns:
        None. The lineage (or a "not found" message) is printed.
    """
    start_word = word
    etymology_chain = [word]  # Lineage, oldest form last
    seen = {word}             # Words already in the chain (cycle guard)

    while True:
        # Targets of every row whose source word is the current word
        targets = df.loc[df["word"] == word, "target_word"]

        # No row for this word: the chain ends here
        if targets.empty:
            break

        next_word = targets.values[0]

        # Following this link would loop back into the chain: stop
        if next_word in seen:
            break

        seen.add(next_word)
        etymology_chain.append(next_word)
        word = next_word  # Set the next search target

    # Format output sentence
    if len(etymology_chain) > 1:
        etymology_str = ' → '.join(etymology_chain)
        print(f'The word "{etymology_chain[0]}" traces back through: {etymology_str}.')
    else:
        print(f'No etymological root found for "{start_word}".')

# Example usage
trace_etymology("eng: ghost", relation_etymology_df)

The word "eng: ghost" traces back through: eng: ghost → enm: gost → ang: gast.
time: 59.8 ms (started: 2025-04-22 02:19:46 +02:00)

# ----------------------------------------------------------------------
# The function in a loop for a list of words
# ----------------------------------------------------------------------

ety_inquiries = [
    'muse', 'ghost', 'table', 'sword', 'shield',
    'spear', 'battle', 'warrior', 'hero', 'goddess',
]

# Trace each word in turn; every trace is followed by a blank line.
for word in ety_inquiries:
    print(f"Word: {word}")
    query = f"eng: {word}"
    trace_etymology(query, relation_etymology_df)
    print()

Word: muse
The word "eng: muse" traces back through: eng: muse → frm: muse → lat: Musa → grc: Μοῦσα.

Word: ghost
The word "eng: ghost" traces back through: eng: ghost → enm: gost → ang: gast.

Word: table
The word "eng: table" traces back through: eng: table → enm: table.

Word: sword
The word "eng: sword" traces back through: eng: sword → enm: sword.

Word: shield
The word "eng: shield" traces back through: eng: shield → ang: scieldan.

Word: spear
The word "eng: spear" traces back through: eng: spear → ang: spere.

Word: battle
The word "eng: battle" traces back through: eng: battle → enm: batel → fro: bataille.

Word: warrior
No etymological root found for "eng: warrior".

Word: hero
No etymological root found for "eng: hero".

Word: goddess
The word "eng: goddess" traces back through: eng: goddess → eng: -ess → fra: -esse → lat: -issa → grc: -ισσα.

time: 506 ms (started: 2025-04-22 02:19:47 +02:00)

# ----------------------------------------------------------------------
# Deadends in the dictionary
# ----------------------------------------------------------------------

# Some words get lost between prefixes and suffixes, e.g. "goddess"

word = "goddess"

# ----------------------------------------------------------------------
# ety module: direct origins and full tree
# ----------------------------------------------------------------------
ety_origin, ety_tree = ety.origins(word), ety.tree(word)
print(
    f"Etymology of '{word}': {ety_origin}",
    f"Etymology tree of '{word}': {ety_tree}",
    sep="\n",
)

# ----------------------------------------------------------------------
# my function: single chain through the rel:etymology rows
# ----------------------------------------------------------------------
print()
trace_etymology(f"eng: {word}", relation_etymology_df)

Etymology of 'goddess': [Word(-ess, English [eng]), Word(god, English [eng])]
Etymology tree of 'goddess': goddess (English)
├── -ess (English)
│   └── -esse (French)
│       ├── -issa (Latin)
│       │   └── -ισσα (Ancient Greek (to 1453))
│       └── -itiam (Latin)
└── god (English)

The word "eng: goddess" traces back through: eng: goddess → eng: -ess → fra: -esse → lat: -issa → grc: -ισσα.
time: 113 ms (started: 2025-04-22 02:19:47 +02:00)

word = "table"

# ----------------------------------------------------------------------
# ety module: direct origins and full tree
# ----------------------------------------------------------------------
ety_origin, ety_tree = ety.origins(word), ety.tree(word)
print(
    f"Etymology of '{word}': {ety_origin}",
    f"Etymology tree of '{word}': {ety_tree}",
    sep="\n",
)

# ----------------------------------------------------------------------
# my function: single chain through the rel:etymology rows
# ----------------------------------------------------------------------
print()
trace_etymology(f"eng: {word}", relation_etymology_df)

Etymology of 'table': [Word(table, Middle English (1100-1500) [enm])]
Etymology tree of 'table': table (English)
└── table (Middle English (1100-1500))

The word "eng: table" traces back through: eng: table → enm: table.
time: 45.5 ms (started: 2025-04-22 02:19:47 +02:00)

# ----------------------------------------------------------------------
# Illustrating deadends
# ----------------------------------------------------------------------

target_word = "eng: table"
ety_deadend = "enm: table"  # Middle English

# Rows for the English word and for the Middle English form it points to.
etymology_root_1 = etymology_df.loc[etymology_df["word"] == target_word]
etymology_root_2 = etymology_df.loc[etymology_df["word"] == ety_deadend]

print("Slice of etymology database:")
print(etymology_root_1.iloc[24:30])  # rows 24..29 of the matches, by position
print("\nIllustrating deadend in etymology database:")
print(etymology_root_2)

Slice of etymology database:
        word        relation                    target_word             
1221570  eng: table  rel:etymologically_related              eng: tablet
1221571  eng: table  rel:etymologically_related            eng: tabulate
1221572  eng: table               rel:etymology               enm: table
1221573  eng: table        rel:has_derived_form        eng: Cayley table
1221574  eng: table        rel:has_derived_form      eng: billiard table
1221575  eng: table        rel:has_derived_form  eng: bring to the table

Illustrating deadend in etymology database:
        word        relation                    target_word
1345511  enm: table  rel:etymological_origin_of  eng: table
time: 472 ms (started: 2025-04-22 02:19:47 +02:00)

# Every dictionary row whose source word is "eng: hero".
hero_etym = etymology_df.loc[etymology_df["word"] == "eng: hero"]
print(hero_etym)

       word       relation                    target_word      
795625  eng: hero  rel:etymological_origin_of    eng: anti-hero
795626  eng: hero  rel:etymological_origin_of     eng: antihero
795627  eng: hero  rel:etymological_origin_of    eng: cyberhero
795628  eng: hero  rel:etymological_origin_of      eng: heroess
795629  eng: hero  rel:etymological_origin_of     eng: heroical
795630  eng: hero  rel:etymological_origin_of     eng: herolike
795631  eng: hero  rel:etymological_origin_of     eng: heroship
795632  eng: hero  rel:etymological_origin_of     eng: megahero
795633  eng: hero  rel:etymological_origin_of      eng: nonhero
795634  eng: hero  rel:etymological_origin_of        eng: shero
795635  eng: hero  rel:etymological_origin_of    eng: superhero
795636  eng: hero  rel:etymologically_related       eng: heroic
795637  eng: hero  rel:etymologically_related      eng: heroics
795638  eng: hero  rel:etymologically_related      eng: heroine
795639  eng: hero  rel:etymologically_related      eng: heroism
795640  eng: hero        rel:has_derived_form  eng: action hero
795641  eng: hero        rel:has_derived_form     eng: antihero
795642  eng: hero        rel:has_derived_form       eng: heroes
795643  eng: hero        rel:has_derived_form       eng: heroic
795644  eng: hero        rel:has_derived_form      eng: heroics
795645  eng: hero        rel:has_derived_form      eng: heroise
795646  eng: hero        rel:has_derived_form      eng: heroism
795647  eng: hero        rel:has_derived_form    eng: superhero
795648  eng: hero        rel:has_derived_form  eng: unsung hero
time: 175 ms (started: 2025-04-22 02:19:48 +02:00)

# ----------------------------------------------------------------------
# 1. Unique words in Odysseys
# ----------------------------------------------------------------------

# Collect every distinct token across all rows of df.
tokens_set = set()
for row_tokens in df['tokens']:
    tokens_set.update(row_tokens)

# Alphabetically sorted vocabulary.
etymolo_list_ody = sorted(tokens_set)
print(len(etymolo_list_ody))
print(etymolo_list_ody[:11])

20543
['aback', 'abandon', 'abandoned', 'abandoning', 'abased', 'abashed', 'abated', 'abating', 'abc', 'abeam', 'abed']
time: 34.7 ms (started: 2025-04-22 02:19:48 +02:00)

#----------------------------------------------------------------------
# 2. polishing our function
#----------------------------------------------------------------------

def get_etymology(word, df):
    """Return the last word reached when following ``word`` through ``df``.

    Starting from ``word``, the rows whose ``"word"`` column equals the
    current word are looked up and the ``"target_word"`` of the *last*
    matching row becomes the next word. The walk ends when a word has no
    row of its own, or when a word comes up a second time (a cycle in the
    table), which would otherwise loop forever.

    Args:
        word: Word to start from, written the way the table writes it
            (e.g. ``"eng: muse"``).
        df: DataFrame with ``"word"`` and ``"target_word"`` columns.

    Returns:
        A one-element list holding the final word of the chain. This is
        ``[word]`` itself when the table has no row for it; on a cycle it is
        the word at which the cycle closed.
    """
    etymology_chain = [word]  # Store the lineage of words
    visited = set()           # Words already expanded (cycle guard)

    while word not in visited:
        visited.add(word)

        # Find the rows where 'word' matches
        etymology_row = df[df["word"] == word]

        # If no match is found, the chain ends here
        if etymology_row.empty:
            break

        # Follow the last matching row to the next word in the chain
        word = etymology_row["target_word"].values[-1]
        etymology_chain.append(word)

    return etymology_chain[-1:]

#----------------------------------------------------------------------
# Testing with toy list
#----------------------------------------------------------------------

toy_list = [
    'spear', 'muse', 'shield', 'abandoning',
    'abeam', 'warrior', 'goddess', 'hero',
]

# Resolve each toy word, keep the result, and print it followed by a blank line.
toy_ety = []
for word in toy_list:
    etymology_chain = get_etymology(f"eng: {word}", relation_etymology_df)
    toy_ety.append(etymology_chain)
    print(f'The word "{word}" traces back through: {etymology_chain}.', end="\n\n")

The word "spear" traces back through: ['ang: spere'].

The word "muse" traces back through: ['grc: Μοῦσα'].

The word "shield" traces back through: ['ang: scield'].

The word "abandoning" traces back through: ['eng: abandoning'].

The word "abeam" traces back through: ['ang: byme'].

The word "warrior" traces back through: ['eng: warrior'].

The word "goddess" traces back through: ['eng: god'].

The word "hero" traces back through: ['eng: hero'].

time: 402 ms (started: 2025-04-22 02:19:48 +02:00)

# Every dictionary row whose source word is "eng: god".
god_etym = etymology_df.loc[etymology_df["word"] == "eng: god"]
print(god_etym)

       word      relation                    target_word          
768225  eng: god  rel:etymological_origin_of            eng: begod
768226  eng: god  rel:etymological_origin_of         eng: godchild
768227  eng: god  rel:etymological_origin_of          eng: goddess
768228  eng: god  rel:etymological_origin_of        eng: godfather
768229  eng: god  rel:etymological_origin_of          eng: godless
768230  eng: god  rel:etymological_origin_of          eng: godlike
768231  eng: god  rel:etymological_origin_of          eng: godling
768232  eng: god  rel:etymological_origin_of          eng: godlore
768233  eng: god  rel:etymological_origin_of          eng: godmama
768234  eng: god  rel:etymological_origin_of          eng: godpapa
768235  eng: god  rel:etymological_origin_of          eng: godship
768236  eng: god  rel:etymological_origin_of           eng: nongod
768237  eng: god  rel:etymological_origin_of         eng: undergod
768238  eng: god  rel:etymological_origin_of            eng: ungod
768239  eng: god        rel:has_derived_form              eng: God
768240  eng: god        rel:has_derived_form  eng: God of the gaps
768241  eng: god        rel:has_derived_form        eng: God-given
768242  eng: god        rel:has_derived_form          eng: Goddess
768243  eng: god        rel:has_derived_form         eng: Godspeed
768244  eng: god        rel:has_derived_form          eng: demigod
768245  eng: god        rel:has_derived_form       eng: god forbid
768246  eng: god        rel:has_derived_form         eng: god king
768247  eng: god        rel:has_derived_form        eng: god-awful
768248  eng: god        rel:has_derived_form        eng: god-child
768249  eng: god        rel:has_derived_form      eng: god-fearing
768250  eng: god        rel:has_derived_form     eng: god-forsaken
768251  eng: god        rel:has_derived_form         eng: god-king
768252  eng: god        rel:has_derived_form         eng: godchild
768253  eng: god        rel:has_derived_form           eng: goddam
768254  eng: god        rel:has_derived_form          eng: goddamn
768255  eng: god        rel:has_derived_form      eng: goddaughter
768256  eng: god        rel:has_derived_form           eng: godded
768257  eng: god        rel:has_derived_form          eng: goddess
768258  eng: god        rel:has_derived_form          eng: godding
768259  eng: god        rel:has_derived_form        eng: godfather
768260  eng: god        rel:has_derived_form      eng: godforsaken
768261  eng: god        rel:has_derived_form          eng: godhead
768262  eng: god        rel:has_derived_form          eng: godhood
768263  eng: god        rel:has_derived_form          eng: godless
768264  eng: god        rel:has_derived_form          eng: godlike
768265  eng: god        rel:has_derived_form        eng: godliness
768266  eng: god        rel:has_derived_form          eng: godling
768267  eng: god        rel:has_derived_form            eng: godly
768268  eng: god        rel:has_derived_form        eng: godmother
768269  eng: god        rel:has_derived_form        eng: godparent
768270  eng: god        rel:has_derived_form             eng: gods
768271  eng: god        rel:has_derived_form          eng: godsend
768272  eng: god        rel:has_derived_form          eng: godship
768273  eng: god        rel:has_derived_form           eng: godson
768274  eng: god        rel:has_derived_form          eng: godward
768275  eng: god        rel:has_derived_form    eng: household god
768276  eng: god        rel:has_derived_form          eng: ungodly
time: 189 ms (started: 2025-04-22 02:19:48 +02:00)

# ----------------------------------------------------------------------
# Step 3.1 Etymology DataFrame Testing
# ----------------------------------------------------------------------

# Sanity check: there must be exactly one etymology entry per toy word.
# (A bare `len(...), len(...)` expression in the middle of a cell is
# evaluated and discarded, so it checked nothing.)
if len(toy_list) != len(toy_ety):
    raise ValueError(
        f"toy_list has {len(toy_list)} words but toy_ety has {len(toy_ety)} entries"
    )

# The toy frame gets its own name so that it does not overwrite the full
# dictionary held in `etymology_df`; cells that query the dictionary keep
# working if they are re-run after this one.
toy_etymology_df = pd.DataFrame({"word": toy_list, "etymology": toy_ety})
toy_etymology_df.T

time: 205 ms (started: 2025-04-22 02:19:49 +02:00)

# ----------------------------------------------------------------------
# Step 3.2: Etymology DataFrame Construction
# ----------------------------------------------------------------------
# This step extracts the etymological root of each word in the Odyssey-related
# vocabulary list (`etymolo_list_ody`) by tracing the etymology chain using
# the `relation_etymology_df` DataFrame.

# The process involves:
# 1. Defining a cycle-aware `get_etymology` function that iteratively follows
#    'target_word' links in the etymological dictionary, with cycle detection
#    to prevent infinite loops.
# 2. Iterating through each word in `etymolo_list_ody`, retrieving the final 
#    etymological form (deepest known origin), and storing it.
# 3. Constructing a new DataFrame `etymology_df` with the format:
#    | word | etymology |
# ----------------------------------------------------------------------

# Counter for cycle warnings
cycle_warning_count = 0


# 1. get_etymology function with cycle detection
def get_etymology(word, df):
    """Return the last word reached when following ``word`` through ``df``.

    Starting from ``word``, the walk repeatedly moves to the target of the
    current word until a word has no entry, or until a word comes up a
    second time. In the latter case a warning is printed and the module
    level ``cycle_warning_count`` is incremented.

    Args:
        word: Word to start from, written the way the table writes it
            (e.g. ``"eng: fellow"``).
        df: Either a DataFrame with ``"word"`` and ``"target_word"``
            columns (the last matching row is followed), or a plain
            ``dict`` mapping each word to its target, e.g.
            ``dict(zip(frame["word"], frame["target_word"]))``. The dict
            form gives the same answers (later duplicates overwrite earlier
            ones, matching "last row wins") without filtering the whole
            frame at every step.

    Returns:
        A one-element list holding the final word of the chain. This is
        ``[word]`` itself when there is no entry for it; on a cycle it is
        the word at which the cycle closed.
    """
    global cycle_warning_count  # Use global to modify the outer counter
    use_lookup = isinstance(df, dict)
    etymology_chain = [word]
    visited_words = set()

    while True:
        if word in visited_words:
            print(f"Warning: Cycle detected for '{word}'. Stopping to prevent infinite loop.")
            cycle_warning_count += 1
            break
        visited_words.add(word)

        if use_lookup:
            if word not in df:
                break
            next_word = df[word]
        else:
            etymology_row = df[df["word"] == word]
            if etymology_row.empty:
                break
            try:
                next_word = etymology_row["target_word"].values[-1]
            except (KeyError, IndexError) as exc:
                # Narrow handler bound to `exc`, so the name `e` is left
                # alone inside this function.
                print(f"Error processing word '{word}': {exc}")
                break

        etymology_chain.append(next_word)
        word = next_word

    return etymology_chain[-1:]

# 2. Process all words
ety_list = []

for i, word in enumerate(etymolo_list_ody, start=1):
    try:
        etymology_chain = get_etymology(f"eng: {word}", relation_etymology_df)
    except Exception as exc:
        # Bound to `exc`, not `e`: at module level Python unbinds the
        # exception name when the handler exits, which would delete the
        # `e` alias of the e_nlp_ody module and break later e.<func> calls.
        print(f"Skipping '{word}' due to error: {exc}")
        etymology_chain = []  # placeholder keeps ety_list aligned with the word list
    ety_list.append(etymology_chain)

    # Progress report every 500 words
    if i % 500 == 0:
        print(f"Processed {i} words...")

print("Processing complete!")
print(f"Number of 'Warning: Cycle detected': {cycle_warning_count}")

# 3. Create the DataFrame
etymology_df = pd.DataFrame({"word": etymolo_list_ody, "etymology": ety_list})

# 4. Check a sample of the DataFrame
print(etymology_df.sample())

Processed 500 words...
Processed 1000 words...
Processed 1500 words...
Warning: Cycle detected for 'eng: boom'. Stopping to prevent infinite loop.
Processed 2000 words...
Processed 2500 words...
Warning: Cycle detected for 'lat: caballus'. Stopping to prevent infinite loop.
Processed 3000 words...
Warning: Cycle detected for 'enm: cluster'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: conductus'. Stopping to prevent infinite loop.
Processed 3500 words...
Warning: Cycle detected for 'lat: construo'. Stopping to prevent infinite loop.
Processed 4000 words...
Warning: Cycle detected for 'lat: cortina'. Stopping to prevent infinite loop.
Processed 4500 words...
Processed 5000 words...
Warning: Cycle detected for 'lat: districtus'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: educo'. Stopping to prevent infinite loop.
Processed 5500 words...
Processed 6000 words...
Warning: Cycle detected for 'lat: faber'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: femella'. Stopping to prevent infinite loop.
Processed 6500 words...
Processed 7000 words...
Processed 7500 words...
Processed 8000 words...
Processed 8500 words...
Processed 9000 words...
Processed 9500 words...
Processed 10000 words...
Processed 10500 words...
Processed 11000 words...
Warning: Cycle detected for 'fra: mine'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'enm: moiste'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'enm: moiste'. Stopping to prevent infinite loop.
Processed 11500 words...
Processed 12000 words...
Processed 12500 words...
Warning: Cycle detected for 'lat: piper'. Stopping to prevent infinite loop.
Processed 13000 words...
Warning: Cycle detected for 'fra: plan'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'fra: plan'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'enm: povre'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'enm: povre'. Stopping to prevent infinite loop.
Processed 13500 words...
Warning: Cycle detected for 'lat: privatus'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: privatus'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: privatus'. Stopping to prevent infinite loop.
Processed 14000 words...
Processed 14500 words...
Processed 15000 words...
Processed 15500 words...
Processed 16000 words...
Processed 16500 words...
Processed 17000 words...
Warning: Cycle detected for 'ang: stille'. Stopping to prevent infinite loop.
Processed 17500 words...
Processed 18000 words...
Warning: Cycle detected for 'lat: titulus'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: titulus'. Stopping to prevent infinite loop.
Processed 18500 words...
Processed 19000 words...
Processed 19500 words...
Processed 20000 words...
Processed 20500 words...
Processing complete!
Number of 'Warning: Cycle detected': 24
     word    etymology     
6470  fellow  [non: fēlagi]
time: 10min 6s (started: 2025-04-22 02:19:49 +02:00)

# ----------------------------------------------------------------------
# Percentage of cycle warnings relative to the vocabulary size
# ----------------------------------------------------------------------
# Stored under its own name: assigning to `warnings` would replace the
# imported `warnings` module with an integer.
n_cycle_warnings = cycle_warning_count
total_words = len(etymolo_list_ody)
# Guard against an empty vocabulary (division by zero).
warning_percentage = (n_cycle_warnings / total_words) * 100 if total_words else 0.0
print(f"Percentage of warnings: {warning_percentage:.2f}%")

Percentage of warnings: 0.12%
time: 1.23 ms (started: 2025-04-22 02:29:55 +02:00)

# ----------------------------------------------------------------------
# Milestone: Etymology DataFrame
# ----------------------------------------------------------------------
e.check_df(etymology_df)

# Write the word/etymology table as tab-separated UTF-8 text, without the
# index column.
etymology_df.to_csv(
    '/Users/debr/odysseys_en/word_ety_odysseys.tsv',
    sep='\t',
    index=False,
    encoding='utf-8',
)

Mr righteous here has no missing values!

* df columns: Index(['word', 'etymology'], dtype='object') 

* Shape: (20543, 2) 

* Total memory in MB: 2.807876
time: 38.4 ms (started: 2025-04-22 02:29:55 +02:00)

# Reload the saved word/etymology table and reduce every etymology entry to
# its language code.

odysseys_df = df_bkp.copy()
etymologypath = '/Users/debr/odysseys_en/word_ety_odysseys.tsv'

etymology_df = pd.read_csv(
    etymologypath,
    sep="\t",
    # The file's first line is a column header. Passing `names` alone would
    # read that line as a data row ("word", "etymology"); header=0 makes
    # pandas consume it and replace it with `names`.
    header=0,
    names=["word", "etymology"],
    # Keep tokens such as "null" or "nan" as ordinary words instead of
    # letting them be parsed as missing values.
    keep_default_na=False,
    encoding="utf-8",
)

# Each entry is the text form of a one-item list, e.g. "['non: fēlagi']".
# The opening quote may be ' or " (the latter when the entry itself contains
# an apostrophe); entries that do not match, such as "[]", become NaN.
# NOTE(review): \w{3} keeps only the first three characters of the language
# prefix — confirm that no longer prefixes occur in the data.
etymology_df['etymology'] = (
    etymology_df['etymology']
    .str.extract(r"\[['\"](\w{3})", expand=False)
    .astype("category")
)

etymology_df.sample(8, random_state=42)

time: 224 ms (started: 2025-04-22 02:29:55 +02:00)

# ----------------------------------------------------------------------
# Labelling the etymology of every word in all our Odysseys
# ----------------------------------------------------------------------

# Step 1. Explode tokens column, remembering in "row_id" which row of
# odysseys_df every token came from.
odyssey_exploded = (
    odysseys_df.explode("tokens")
    .rename_axis("row_id")
    .reset_index()
)

# Step 2. Merge with etymology_df to get the etymology of every token.
# One row per word is kept so the left merge cannot multiply token rows.
word_to_etymology = (
    etymology_df[['word', 'etymology']]
    .dropna(subset=['word'])
    .drop_duplicates(subset='word', keep='last')
)
odyssey_exploded = odyssey_exploded.merge(
    word_to_etymology, left_on='tokens', right_on='word', how='left'
)

# Step 3. Collect the etymologies back into one list per original row.
# Grouping by row_id and assigning by index keeps every list on the row it
# was built from. Grouping by (translator, book_num) and assigning by
# position does not: groupby sorts by category order, which need not match
# the order of the rows in odysseys_df.
odysseys_df['etymology'] = (
    odyssey_exploded.groupby('row_id', sort=False)['etymology'].apply(list)
)

odysseys_df['etymology_counts'] = odysseys_df['etymology'].apply(lambda x: dict(Counter(x)))
odysseys_df.sample(6, random_state=42)

time: 142 ms (started: 2025-04-22 02:29:55 +02:00)

#----------------------------------------------------------------------------
# Power law distribution plot
#----------------------------------------------------------------------------

def process_and_plot_zipfs_law(odysseys_df):
    """
    Draw a rank/frequency (Zipf-style) log-log scatter, one series per translator.

    For every translator the per-book 'etymology_counts' dictionaries are
    summed into a single tally. The tallied labels are ordered by descending
    total, ranked from 1, and plotted as (rank, frequency) points on log-log
    axes. The figure is written to disk as a PNG and then displayed.

    Reads the notebook-level names `palette`, `output_path_plots` and `nb_id`.
    Returns nothing.
    """
    for translator, books in odysseys_df.groupby("translator"):
        # Sum the counts of all books of this translator, label by label
        totals = Counter()
        for book_counts in books["etymology_counts"]:
            totals.update(book_counts)

        # most_common() orders by descending count; rank 1 is the top label
        ranked = pd.DataFrame(totals.most_common(), columns=["word", "frequency"])
        ranked["rank"] = np.arange(1, len(ranked) + 1)

        plt.loglog(
            ranked["rank"],
            ranked["frequency"],
            label=translator,
            marker=".",
            linestyle="None",
            alpha=0.6,
            color=palette[translator],
        )

    plt.xlabel("Rank (log scale)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Log vs log of Etymology Frequency vs Rank")
    plt.legend()
    plt.grid(True, linestyle="--", linewidth=0.3, alpha=0.3)
    plt.savefig(f"{output_path_plots}_Etymo_FreqRank-{nb_id}.png", dpi=300, bbox_inches='tight')
    plt.show()

# Example usage:
process_and_plot_zipfs_law(odysseys_df)

time: 1.04 s (started: 2025-04-22 02:29:55 +02:00)

# ----------------------------------------------------------------------
# Etymology frequency analysis
# ----------------------------------------------------------------------

def analyze_word_frequencies(odysseys_df):
    """
    Plot the distribution of log-transformed counts per translator and
    test it for normality.

    For each translator (in order of first appearance) every value stored in
    the 'etymology_counts' dictionaries of its rows is collected, non-positive
    values are dropped and log1p is applied. Translators with more than 10
    remaining values get a KDE curve and three normality tests (Shapiro-Wilk,
    Kolmogorov-Smirnov against a normal, D'Agostino K^2); the others are
    skipped. The combined KDE figure is shown and the p-values are printed.

    NOTE(review): the KS test is parameterised with the mean and standard
    deviation estimated from the very sample it tests.
    """
    pvalues_by_translator = {}

    for translator, books in odysseys_df.groupby("translator", sort=False):
        # Flatten the counts of all books into one array, strictly positive only
        counts = np.array(
            [value
             for book_counts in books["etymology_counts"]
             for value in book_counts.values()]
        )
        counts = counts[counts > 0]

        # Too few observations for a meaningful density or test
        if len(counts) <= 10:
            continue

        logged = np.log1p(counts)

        sns.kdeplot(logged, fill=False, alpha=0.6, label=translator)

        shapiro_result = shapiro(logged)
        ks_result = kstest(logged, 'norm', args=(np.mean(logged), np.std(logged)))
        dagostino_result = normaltest(logged)

        pvalues_by_translator[translator] = {
            "Shapiro-Wilk p-value": shapiro_result.pvalue,
            "Kolmogorov-Smirnov p-value": ks_result.pvalue,
            "D'Agostino K^2 p-value": dagostino_result.pvalue,
        }

    plt.xlabel("Log Word Frequency")
    plt.ylabel("Density")
    plt.title("Log-Transformed KDE Plot of Word Frequencies Across Translators")
    plt.legend(title="Translator")
    plt.grid(True, linestyle="--", linewidth=0.3, alpha=0.3)
    plt.show()

    # Report the p-values, one block per translator
    for translator, pvalues in pvalues_by_translator.items():
        print(f"\nTranslator: {translator}")
        for test_name, p_value in pvalues.items():
            print(f"  {test_name}: p = {p_value:.5f}")

# Example usage:
analyze_word_frequencies(odysseys_df)

Translator: AT_Murray
  Shapiro-Wilk p-value: p = 0.00000
  Kolmogorov-Smirnov p-value: p = 0.00000
  D'Agostino K^2 p-value: p = 0.00000

Translator: Fitzgerald
  Shapiro-Wilk p-value: p = 0.00000
  Kolmogorov-Smirnov p-value: p = 0.00000
  D'Agostino K^2 p-value: p = 0.00000

Translator: Lattimore
  Shapiro-Wilk p-value: p = 0.00000
  Kolmogorov-Smirnov p-value: p = 0.00000
  D'Agostino K^2 p-value: p = 0.00000

Translator: Fagles
  Shapiro-Wilk p-value: p = 0.00000
  Kolmogorov-Smirnov p-value: p = 0.00000
  D'Agostino K^2 p-value: p = 0.00000

Translator: Wilson
  Shapiro-Wilk p-value: p = 0.00000
  Kolmogorov-Smirnov p-value: p = 0.00000
  D'Agostino K^2 p-value: p = 0.00000

Translator: Green
  Shapiro-Wilk p-value: p = 0.00000
  Kolmogorov-Smirnov p-value: p = 0.00000
  D'Agostino K^2 p-value: p = 0.00000

Translator: Woolf
  Shapiro-Wilk p-value: p = 0.00000
  Kolmogorov-Smirnov p-value: p = 0.00000
  D'Agostino K^2 p-value: p = 0.00000
time: 458 ms (started: 2025-04-22 02:29:56 +02:00)

# ----------------------------------------------------------------------
# Comparing etymological distributions
# ----------------------------------------------------------------------

# Step 1. Normalize the etymology counts by token-row length

def normalize_etymologies(row):
    """
    Convert one row's etymology counts into per-token proportions.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'tokens' (a sized collection) and 'etymology_counts'
        (a dict of label -> count).

    Returns
    -------
    dict
        label -> count / len(row['tokens']). An empty dict is returned when
        the row has no tokens, since proportions are undefined there and a
        plain division would raise ZeroDivisionError.
    """
    total_tokens = len(row['tokens'])
    if not total_tokens:
        return {}
    return {k: v / total_tokens for k, v in row['etymology_counts'].items()}

odysseys_df['normalized_etymology'] = odysseys_df.apply(normalize_etymologies, axis=1)

# Step 2. Aggregate the normalized etymologies by translator:
#         translator -> root -> sum of the per-book proportions

translator_agg = defaultdict(lambda: defaultdict(float))

for translator, shares in zip(odysseys_df['translator'],
                              odysseys_df['normalized_etymology']):
    per_root = translator_agg[translator]
    for root, share in shares.items():
        per_root[root] += share

# Roots a translator never uses are filled with 0 before transposing
agg_df = pd.DataFrame(translator_agg).fillna(0).T  # shape: (n_translators, n_roots)

# Step 3. Filter etymology roots (excluding eng)
top_roots = ['enm', 'ang', 'lat', 'fro', 'non', 'grc']
filtered_df = agg_df[top_roots]

# Step 4. Normalize so rows sum to 1 (per translator)
row_totals = filtered_df.sum(axis=1)
prop_df = filtered_df.div(row_totals, axis=0)

# Step 5. Plot the results as one stacked bar per translator
prop_df.plot.bar(stacked=True)
plt.title("Etymology Root Proportions by Translator")
plt.xlabel("Translator", fontsize=14)
plt.ylabel("Proportion", fontsize=14)
plt.xticks(rotation=0, fontsize=12)
plt.yticks(fontsize=12)
# Horizontal reference line at half of the stacked total
plt.axhline(0.5, color='green', linestyle='--', label='Baseline 0.5')
plt.legend(title="Etymology Root", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(f"{output_path_plots}Etymology_Root_Proportions-{nb_id}.png")
plt.show()

time: 904 ms (started: 2025-04-22 02:29:57 +02:00)

# ----------------------------------------------------------------------
# Comparing etymological distributions as proportions
# ----------------------------------------------------------------------

# Step 1. Normalize the etymology counts by token-row length
def normalize_etymologies(row):
    """
    Convert one row's etymology counts into per-token proportions.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'tokens' (a sized collection) and 'etymology_counts'
        (a dict of label -> count).

    Returns
    -------
    dict
        label -> count / len(row['tokens']). An empty dict is returned when
        the row has no tokens, since proportions are undefined there and a
        plain division would raise ZeroDivisionError.
    """
    total_tokens = len(row['tokens'])
    if not total_tokens:
        return {}
    return {k: v / total_tokens for k, v in row['etymology_counts'].items()}

odysseys_df['normalized_etymology'] = odysseys_df.apply(normalize_etymologies, axis=1)

# Step 2. Aggregate the normalized etymologies by translator:
#         translator -> root -> sum of the per-book proportions
translator_agg = defaultdict(lambda: defaultdict(float))

book_rows = odysseys_df[['translator', 'normalized_etymology']]
for translator, shares in book_rows.itertuples(index=False, name=None):
    for root, share in shares.items():
        translator_agg[translator][root] += share

# Roots a translator never uses are filled with 0 before transposing
agg_df = pd.DataFrame(translator_agg).fillna(0).T  # shape: (n_translators, n_roots)

# Step 3. Filter and normalize: keep the 'ang' and 'lat' columns and rescale
#         each translator's pair so that it sums to 1
top_roots = ['ang', 'lat']
filtered_df = agg_df[top_roots]
row_totals = filtered_df.sum(axis=1)
prop_df = filtered_df.div(row_totals, axis=0)

# Step 4. Plot the results as one two-colour stacked bar per translator
prop_df.plot.bar(stacked=True, color=['#B0124D', '#177070'])
plt.title("Etymology Root Proportions Anglo-Saxon vs Latin")
plt.xlabel("Translator", fontsize=14)
plt.ylabel("Proportion", fontsize=14)
plt.xticks(rotation=0, fontsize=12)
plt.yticks(fontsize=12)
# NOTE(review): 0.77 is hard-coded; going by the label it is presumably the
# 'ang' share of Fagles -- confirm against prop_df.
plt.axhline(0.77, color='blue', linestyle='--', label='Baseline Fagles')
plt.grid(axis='y', linestyle='--', linewidth=0.5, alpha=0.7)
plt.legend(title="Etymology Root", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(f"{output_path_plots}Etymology_Root_Proportions_AngLat-{nb_id}.png")
plt.show()

time: 770 ms (started: 2025-04-22 02:29:58 +02:00)

#-------------------------------------------------------------------
# Etymology distributions to show stylistic divergence
#-------------------------------------------------------------------

# Step 1. Normalize the etymology counts by token-row length
def normalize_etymologies(row):
    """
    Convert one row's etymology counts into per-token proportions.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'tokens' (a sized collection) and 'etymology_counts'
        (a dict of label -> count).

    Returns
    -------
    dict
        label -> count / len(row['tokens']). An empty dict is returned when
        the row has no tokens, since proportions are undefined there and a
        plain division would raise ZeroDivisionError.
    """
    total_tokens = len(row['tokens'])
    if not total_tokens:
        return {}
    return {k: v / total_tokens for k, v in row['etymology_counts'].items()}

odysseys_df['normalized_etymology'] = odysseys_df.apply(normalize_etymologies, axis=1)

# Step 2. Aggregate the normalized etymologies by translator:
#         translator -> root -> sum of the per-book proportions
translator_agg = defaultdict(lambda: defaultdict(float))

for _, row in odysseys_df.iterrows():
    translator = row['translator']
    for k, v in row['normalized_etymology'].items():
        translator_agg[translator][k] += v

# Roots a translator never uses are filled with 0 before transposing
agg_df = pd.DataFrame(translator_agg).fillna(0).T  # shape: (n_translators, n_roots)

# Step 3. Filter for the top 7 roots
top_roots = ['eng', 'enm', 'ang', 'lat', 'fro', 'non', 'grc']
filtered_df = agg_df[top_roots]

# Step 4. Normalize so rows sum to 1 (per translator)
prop_df = filtered_df.div(filtered_df.sum(axis=1), axis=0)

# Step 5. Compare each translator to the mean distribution.
# mean_dist is taken BEFORE the distance column is added, so it only covers
# the root columns; scipy's `cosine` is a distance (1 - cosine similarity).
mean_dist = prop_df.mean().values  # .values = 1D numpy array
prop_df['cosine_dist_from_mean'] = prop_df.apply(lambda row: cosine(row.values, mean_dist), axis=1)

#-------------------------------------------------------------------
# Output and viz
#-------------------------------------------------------------------

# Print with full precision to catch small differences
with pd.option_context('display.float_format', '{:.10f}'.format):
    print(prop_df[['cosine_dist_from_mean']])

# Sort and expose the translator index as a column for plotting.
# rename_axis names the index first, so reset_index yields a 'translator'
# column whether or not the index already carried a name, and a single
# reset is enough (a second one would only add a junk 'index' column).
prop_df_sorted = (
    prop_df.sort_values("cosine_dist_from_mean", ascending=False)
    .rename_axis("translator")
    .reset_index()
)

sns.barplot(
    data=prop_df_sorted,
    x="cosine_dist_from_mean", y="translator",
    palette=palette,
    hue="translator", dodge=False,
)
plt.xlabel("Cosine Distance from Mean Distribution")
plt.title("Etymology Stylistic Divergence")
plt.tight_layout()
plt.savefig(f"{output_path_plots}Etymo_Stylistic_Divergence-{nb_id}.png", dpi=300, bbox_inches='tight')
plt.show()

            cosine_dist_from_mean
AT_Murray  0.0024429419          
Fitzgerald 0.0000973649          
Lattimore  0.0003457566          
Fagles     0.0000901285          
Wilson     0.0009285934          
Green      0.0004112601          
Woolf      0.0061264767

time: 720 ms (started: 2025-04-22 02:29:58 +02:00)

#-------------------------------------------------------------------
# Etymology poles: anglo-saxon vs. latin
#-------------------------------------------------------------------

# Reuses `agg_df` (translator x root sums of per-book proportions) as left
# behind by the preceding aggregation cell.
top_roots = ['ang', 'lat']
filtered_df = agg_df[top_roots]

# Normalize so rows sum to 1 (per translator)
prop_df = filtered_df.div(filtered_df.sum(axis=1), axis=0)

# Compare each translator to the mean distribution.
# mean_dist is taken BEFORE the distance column is added, so it only covers
# the two root columns; scipy's `cosine` is a distance (1 - cosine similarity).
mean_dist = prop_df.mean().values  # .values = 1D numpy array
prop_df['cosine_dist_from_mean'] = prop_df.apply(lambda row: cosine(row.values, mean_dist), axis=1)

# Print with full precision to catch small differences
with pd.option_context('display.float_format', '{:.10f}'.format):
    print(prop_df[['cosine_dist_from_mean']])

# Sort and expose the translator index as a column for plotting.
# rename_axis names the index first, so reset_index yields a 'translator'
# column whether or not the index already carried a name, and a single
# reset is enough (a second one would only add a junk 'index' column).
prop_df_sorted = (
    prop_df.sort_values("cosine_dist_from_mean", ascending=False)
    .rename_axis("translator")
    .reset_index()
)

sns.barplot(
    data=prop_df_sorted,
    x="cosine_dist_from_mean", y="translator",
    palette=palette,
    hue="translator", dodge=False,
)
plt.xlabel("Cosine Distance from Mean Distribution")
# Title fixed: the original string started with a stray "S"
plt.title("Etymology Stylistic Divergence in Anglo-saxon vs. Latin")
plt.tight_layout()
plt.savefig(f"{output_path_plots}Etymo_Divergence_Ang_Lat-{nb_id}.png", dpi=300, bbox_inches='tight')
plt.show()

            cosine_dist_from_mean
AT_Murray  0.0003544942          
Fitzgerald 0.0004376970          
Lattimore  0.0007190432          
Fagles     0.0000609730          
Wilson     0.0012118305          
Green      0.0006088202          
Woolf      0.0174467372

time: 718 ms (started: 2025-04-22 02:29:59 +02:00)

#-------------------------------------------------------------------
# Etymology-Weighted TF-IDF 
#-------------------------------------------------------------------

# Build one document per translator: the tokens of all of its books, joined
# by single spaces (translators kept in order of first appearance)
translator_docs = odysseys_df.groupby("translator", sort=False)["tokens"]\
                             .apply(lambda books: " ".join(token for book in books for token in book))

# Compute TF-IDF. The documents are already tokenized, so they are split on
# whitespace only and the case is left untouched.
tfidf = TfidfVectorizer(lowercase=False, tokenizer=lambda x: x.split())
tfidf_matrix = tfidf.fit_transform(translator_docs)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=translator_docs.index, columns=tfidf.get_feature_names_out())

# Look up every vocabulary word and keep the LAST element that
# get_etymology returns for it (presumably the deepest root of the chain --
# confirm against get_etymology). This is one lookup per vocabulary word.
etym_dict = {word: get_etymology("eng: " + word, relation_etymology_df)[-1] for word in tfidf_df.columns}

# Bucket the vocabulary by root in a single pass. Scanning the whole
# vocabulary once per root is quadratic, and iterating a set of roots makes
# the column order differ from run to run; here the roots keep the order in
# which they are first met in the vocabulary.
words_by_root = defaultdict(list)
for word, root in etym_dict.items():
    words_by_root[root].append(word)

# Etymology-weighted TF-IDF: for each root, the summed TF-IDF weight of the
# words that map to it. Built in one go instead of column-by-column inserts.
etym_tfidf = pd.DataFrame(
    {root: tfidf_df[words].sum(axis=1) for root, words in words_by_root.items()},
    index=tfidf_df.index,
)

# Normalize the etymology-weighted TF-IDF so each translator's row sums to 1
etym_tfidf_normalized = etym_tfidf.div(etym_tfidf.sum(axis=1), axis=0)

Warning: Cycle detected for 'eng: boom'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: caballus'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'enm: cluster'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: conductus'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: construo'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: cortina'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: districtus'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: educo'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: faber'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: femella'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'fra: mine'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'enm: moiste'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'enm: moiste'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: piper'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'fra: plan'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'fra: plan'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'enm: povre'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'enm: povre'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: privatus'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: privatus'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: privatus'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'ang: stille'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: titulus'. Stopping to prevent infinite loop.
Warning: Cycle detected for 'lat: titulus'. Stopping to prevent infinite loop.
time: 11min 19s (started: 2025-04-22 02:30:00 +02:00)

#-------------------------------------------------------------------
# TF-IDF PCA with Scatterplot
#-------------------------------------------------------------------

# Project the normalized etymology-weighted TF-IDF rows (one per translator)
# onto their first two principal components
pca = PCA(n_components=2)
coords = pca.fit_transform(etym_tfidf_normalized)

plt.figure(figsize=(8,8))

# One labelled point per translator; rows of `coords` follow the frame's index
for translator, (pc1, pc2) in zip(etym_tfidf.index, coords):
    plt.scatter(pc1, pc2, label=translator, s=160)
    #plt.text(pc1+0.01, pc2+0.01, translator)

plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Translator Clustering by TF-IDF Etymological Preference")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{output_path_plots}TF-IDF_PCA-{nb_id}.png", dpi=300, bbox_inches='tight')
plt.show()

time: 788 ms (started: 2025-04-22 02:41:19 +02:00)

#-------------------------------------------------------------------
# TF-IDF Clustering with cluster map
#-------------------------------------------------------------------

# Keep the `top_n` roots with the highest mean normalized weight across
# translators
top_n = 100
mean_tfidf = etym_tfidf_normalized.mean(axis=0)
top_features = mean_tfidf.sort_values(ascending=False).head(top_n).index

reduced_tfidf = etym_tfidf_normalized[top_features]

# NOTE(review): `scaled` (z-scored copy of reduced_tfidf) is not consumed in
# this cell -- the clustermap below is fed `reduced_tfidf` and rescales the
# columns itself through standard_scale=1. Kept in case a later cell uses it.
scaled = pd.DataFrame(
    StandardScaler().fit_transform(reduced_tfidf),
    index=reduced_tfidf.index,
    columns=reduced_tfidf.columns
)

# Custom palette
top_value = '#214D72'
middle_value = '#50BFC3'
bottom_value = '#C8EEF0'

# Create custom colormap (light -> dark)
custom_cmap = LinearSegmentedColormap.from_list('neptune_blue', [bottom_value,  middle_value, top_value])

# One colour strip entry per translator row, taken from the shared palette
row_colors = reduced_tfidf.index.to_series().map(palette)
sns.clustermap(
    reduced_tfidf,
    metric="cosine",
    cmap=custom_cmap,
    standard_scale=1,
    row_colors=row_colors
)
# A clustermap figure is made of several axes, and plt.title() only labels
# whichever of them is pyplot's current axes (not the figure as a whole).
# suptitle() sets a figure-level title; y > 1 lifts it above the top
# dendrogram, and bbox_inches='tight' keeps it inside the saved image.
plt.suptitle("TF-IDF Etymological Preference Clustering", y=1.02)
plt.savefig(f"{output_path_plots}TF-IDF_Clustermap-{nb_id}.png", dpi=300, bbox_inches='tight')
plt.show()

time: 1.23 s (started: 2025-04-22 02:41:20 +02:00)

	word	relation	target_word
5463534	spa: condicional	rel:has_derived_form	spa: condicionales
467142	eng: albuminization	rel:etymologically_related	eng: toxalbumin
2075742	fra: insulteur	rel:etymologically_related	fra: insulteuse
4897623	lat: torculo	rel:has_derived_form	lat: torculavissent
255143	deu: anfangen	rel:has_derived_form	deu: anfingt
4675182	lat: prodicimini	rel:is_derived_from	lat: prodico
1992445	fra: enflai	rel:is_derived_from	fra: enfler
5773738	spa: sobrecargasen	rel:is_derived_from	spa: sobrecargar
4882662	lat: taedeo	rel:has_derived_form	lat: taedueritis
5357602	spa: aclaraseis	rel:is_derived_from	spa: aclarar

Relation	Description
rel:etymological_origin_of	Indicates that a word is the root or ancestor of another word. Example: Latin scientia is the etymological origin of English science.
rel:has_derived_form	Shows that a word has a derived variant. Example: happy has the derived form happiness.
rel:is_derived_from	Specifies that a word originates from another word. This is the inverse of rel:etymological_origin_of. Example: English science is derived from Latin scientia.
rel:etymology	A general relation indicating the etymology of a word without specifying a direction of derivation.
rel:etymologically_related	Indicates that two words share a common etymology but are not directly derived from one another. Example: English hospital and host are etymologically related through Latin hospes.
rel:variant:orthography	Refers to different spellings of the same word. Example: color (American English) vs. colour (British English).
rel:derived	A broad category that includes words that evolved from another language but does not specify the exact relationship.
rel:etymologically	A general tag used to indicate some form of etymological connection. Often used when the exact type of relation is unclear.

	word	relation	target_word
1727960	fin: räkänokka	rel:etymology	fin: nokka
753310	eng: futureless	rel:etymology	eng: future
842271	eng: inveracity	rel:etymology	eng: in-
804350	eng: homœostasis	rel:etymology	eng: homœ-
1622981	fin: huumekauppias	rel:etymology	fin: huume
404060	eng: Ceylon	rel:etymology	grc: Σελεδίβα
1063714	eng: pritumumab	rel:etymology	eng: -tum-
5429877	spa: cabezota	rel:etymology	spa: -ota

	word	etymology
385	ails	eng
8164	happiest	eng
8845	humility	lat
13113	pitt	eng
4032	crookedminded	eng
7755	granting	eng
718	apartment	eng
3947	cravings	eng

	translator	book_num	text	tokens	etymology	etymology_counts
137	Green	18	[Now there came up a public beggar, whose custom it was to beg\n, through the town of Ithake, well known for his ravenous belly,\n, forever guzzling and swilling. There was no strength in him,\n, ...	[came, public, beggar, whose, custom, beg, town, ithake, well, known, ravenous, belly, forever, guzzling, swilling, strength, force, yet, great, bulk, made, imposing, sight, name, arnaios, lady, m...	[eng, ang, eng, eng, eng, eng, eng, eng, eng, eng, enm, eng, non, eng, eng, ang, ang, ang, eng, eng, ang, eng, eng, eng, fro, ang, eng, lat, eng, enm, eng, heb, ang, enm, ang, heb, eng, eng, eng, ...	{'eng': 836, 'ang': 351, 'enm': 275, 'non': 29, 'fro': 45, 'lat': 99, 'heb': 7, 'xno': 9, 'san': 2, 'fra': 15, 'grc': 8, 'p_g': 2, 'cym': 1, 'frm': 1, 'ara': 1, 'tur': 1, 'ita': 2, 'spa': 1}
30	Fitzgerald	7	[As Lord Odysseus prayed there in the grove \n, the girl rode on, behind her strapping team, \n, and came late to the mansion of her father, \n, where she reined in at the courtyard gate. Her brot...	[lord, odysseus, prayed, grove, girl, rode, behind, strapping, team, came, late, mansion, father, reined, courtyard, gate, brothers, awaited, like, tall, gods, court, circling, lead, mules, away, ...	[eng, ang, enm, eng, eng, ang, eng, eng, eng, eng, ang, lat, eng, ang, lat, eng, eng, eng, eng, eng, fro, eng, ang, eng, eng, eng, ang, eng, enm, eng, eng, ang, ang, ang, ang, enm, enm, ang, eng, ...	{'eng': 921, 'ang': 408, 'enm': 270, 'lat': 107, 'fro': 45, 'frm': 7, 'xno': 14, 'non': 21, 'grc': 17, 'msa': 1, 'cym': 1, 'fra': 7, 'p_g': 2, 'goh': 1, 'tur': 1, 'fin': 2, 'ita': 1, 'heb': 1, 'po...
119	Wilson	24	[Then Hermes called the spirits of the suitors\n, out of the house. He held the golden wand\n, with which he casts a spell to close men’s eyes\n, or open those of sleepers when he wants.\n, He led...	[hermes, called, spirits, suitors, house, held, golden, wand, casts, spell, close, men, eyes, open, sleepers, wants, led, spirits, followed, squeaking, like, bats, secret, crannies, cave, cling, t...	[eng, eng, eng, eng, eng, enm, ang, eng, eng, eng, enm, ang, eng, eng, eng, eng, eng, eng, enm, eng, eng, eng, eng, eng, eng, eng, eng, enm, ang, lat, eng, eng, eng, ang, fro, eng, eng, eng, eng, ...	{'eng': 1382, 'enm': 396, 'ang': 688, 'lat': 144, 'fro': 70, 'grc': 19, 'arg': 1, 'xno': 17, 'fra': 17, 'ara': 1, 'non': 37, 'cym': 5, nan: 2, 'p_g': 2, 'msa': 1, 'frm': 3, 'gml': 1, 'heb': 1, 'it...
29	Fitzgerald	6	[Far gone in weariness, in oblivion, \n, the noble and enduring man slept on; \n, but Athena in the night went down the land \n, of the Phaiakians, entering their city. \n, In days gone by, these ...	[far, gone, weariness, oblivion, noble, enduring, man, slept, athena, night, went, land, phaiakians, entering, city, days, gone, men, held, hypereia, country, wide, dancing, grounds, near, overbea...	[fro, ang, eng, ang, eng, eng, ang, eng, enm, ang, ang, eng, eng, fro, eng, eng, lat, eng, enm, eng, enm, eng, eng, eng, lat, eng, eng, eng, lat, eng, eng, eng, enm, eng, enm, eng, eng, enm, lat, ...	{'fro': 48, 'ang': 399, 'eng': 802, 'enm': 256, 'lat': 104, 'frm': 5, 'xno': 15, 'grc': 18, 'fra': 11, 'dum': 8, 'non': 19, nan: 2, 'tur': 1, 'por': 1, 'heb': 1, 'p_g': 1, 'cym': 1}
142	Green	23	[Chuckling, the old woman ascended to the upper chamber,\n, to bring her mistress the news that her dear husband was there,\n, in the house: her feet hobbled, but her knees moved briskly,\n, and s...	[chuckling, old, woman, ascended, upper, chamber, bring, mistress, news, dear, husband, house, feet, hobbled, knees, moved, briskly, stood, penelope, head, addressed, saying, wake, penelope, dear,...	[eng, ang, enm, eng, eng, eng, ang, enm, eng, ang, enm, non, enm, eng, ang, eng, eng, eng, eng, eng, eng, eng, enm, eng, ang, eng, ang, lat, ang, ang, eng, ang, enm, ang, eng, enm, ang, eng, enm, ...	{'eng': 700, 'ang': 363, 'enm': 216, 'non': 13, 'lat': 83, 'fro': 29, 'xno': 7, 'spa': 1, 'fra': 7, 'grc': 18, 'arg': 1, 'heb': 3, 'dum': 1, 'frk': 1}
161	Woolf	18	[Everything he saw was distasteful to him. He hated the blue and white,\n, the intensity and definiteness, the hum and heat of the south; the\n, landscape seemed to him as hard and as romantic as ...	[everything, saw, distasteful, hated, blue, white, intensity, definiteness, hum, heat, south, landscape, seemed, hard, romantic, cardboard, background, stage, mountain, wooden, screen, sheet, pain...	[eng, ang, eng, eng, enm, ang, eng, eng, eng, ang, eng, eng, eng, ang, fro, eng, eng, lat, eng, ang, xno, eng, eng, enm, eng, ang, eng, ang, ang, eng, eng, eng, ang, p_g, eng, ang, eng, lat, eng, ...	{'eng': 376, 'ang': 172, 'enm': 101, 'fro': 17, 'lat': 91, 'xno': 14, 'p_g': 1, 'non': 7, 'grc': 6, 'heb': 1, 'frm': 2, 'gml': 1, 'gmh': 1, 'goh': 1}

Lexical Analysis of 20th-Century Odyssey Translations (Part B): Etymologies¶

Road Map¶

Approaching the Etymologies of the Odyssey¶

2. The Dictionaries¶

Working with ety¶

Developing a dictionary we can use¶

Designing our dictionary-look up¶

Step 1. Observations on Melo’s Dictionary and Etymological Pitfalls¶

Step 2. Building an etymological lexicon¶

Word-to-Etymology Map¶

Root Frequencies¶

Etymology Bias Analysis in Odyssey Translations¶

Overall Etymology Distribution¶

Anglo-Saxon vs Latin Preferences¶

Normalized Comparison Against the Mean¶

Overall Etymology¶

Latin and Anglo-Saxon Only¶

TF-IDF Weighted Analysis¶

Cluster Analysis Results¶

	0	1	2	3	4	5	6	7
word	spear	muse	shield	abandoning	abeam	warrior	goddess	hero
etymology	[ang: spere]	[grc: Μοῦσα]	[ang: scield]	[eng: abandoning]	[ang: byme]	[eng: warrior]	[eng: god]	[eng: hero]