-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcharacter_onstage_kmeans.py
More file actions
executable file
·72 lines (59 loc) · 2.98 KB
/
character_onstage_kmeans.py
File metadata and controls
executable file
·72 lines (59 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.manifold import MDS
import plotly.express as px
def compute_similarity(char1, char2, df: pd.DataFrame) -> int:
    """Count the columns in which both characters have a truthy value.

    Args:
        char1: Index label of the first character's row in ``df``.
        char2: Index label of the second character's row in ``df``.
        df: DataFrame with one row per character. Every cell is coerced
            to ``bool``, so any non-zero value counts as "present".
            Row labels are assumed to be unique -- with duplicates
            ``df.loc`` returns a DataFrame instead of a single row.

    Returns:
        The number of columns that are truthy in both rows.
    """
    both_present = df.loc[char1].astype(bool) & df.loc[char2].astype(bool)
    # Series.sum() reduces in vectorised code; the builtin sum() would walk
    # the Series one element at a time in Python.
    return int(both_present.sum())
# Load the CSV file with the play titles. It is read without a header row:
# column 0 is used as the XML filename and column 1 as the full play title.
metadata_dir = "metadata"
plays_df = pd.read_csv(
    os.path.join(metadata_dir, "list_of_shakespeare_plays.csv"),
    header=None,
)

# Create a mapping dictionary
# { 'H8_onstage.csv': 'Henry VIII', 'Ham_onstage.csv': 'Hamlet', ... }
# regex=False is passed explicitly: the default changed between pandas
# releases, and as a regular expression the '.' in '.xml' would match any
# character instead of a literal dot.
play_title_mapping = dict(
    zip(
        plays_df[0].str.replace('.xml', '', regex=False) + '_onstage.csv',
        plays_df[1],
    )
)
def get_output_filename(csv_filename):
    """Build the HTML output filename for an on-stage CSV filename.

    The play title is looked up in ``play_title_mapping``, its spaces are
    turned into underscores, and ``_onstage_kmeans.html`` is appended.

    Args:
        csv_filename: Key to look up in ``play_title_mapping``.

    Returns:
        The output filename, ``<Play_Title>_onstage_kmeans.html``.

    Raises:
        ValueError: If the filename has no mapping entry, or the mapped
            title is empty.
    """
    title = play_title_mapping.get(csv_filename, "")
    # Splitting on single spaces and re-joining with "_" swaps every space
    # for an underscore.
    slug = "_".join(title.split(" "))
    if slug == "":
        raise ValueError(f"No title mapping found for {csv_filename}")
    return slug + "_onstage_kmeans.html"
# Directories
input_directory = 'output_onstage'
output_directory = 'output_onstage_kmeans'

# Create output directory if it does not exist. exist_ok=True replaces the
# separate os.path.exists() check and its check-then-create race.
os.makedirs(output_directory, exist_ok=True)

# Process each CSV file in the input directory
for filename in os.listdir(input_directory):
    if not filename.endswith("_onstage.csv"):
        continue

    # Resolve the output filename first, so a file without a title mapping
    # is reported and skipped before any clustering work is done.
    try:
        output_filename = get_output_filename(filename)
    except ValueError as e:
        print(e)
        continue

    filepath = os.path.join(input_directory, filename)
    df = pd.read_csv(filepath, index_col=0)

    # Clustering into two groups needs at least two rows; linkage() raises
    # on fewer, so skip such files instead of aborting the whole run.
    n_characters = len(df)
    if n_characters < 2:
        print(f"Skipping {filename}: need at least two characters to cluster")
        continue

    # Compute similarity matrix: entry (i, j) is the number of columns in
    # which both row i and row j are truthy. The matrix product yields the
    # same counts as calling compute_similarity() on every pair, without
    # the Python-level double loop.
    presence = df.astype(bool).to_numpy().astype(float)
    similarity_matrix = presence @ presence.T

    # Turn the counts into dissimilarities in [0, 1]. Subtracting raw counts
    # from 1 would give negative values, which are not valid distances for
    # MDS. Scaling by the largest count keeps the ordering of the pairs.
    max_similarity = similarity_matrix.max()
    if max_similarity > 0:
        dissimilarity_matrix = 1 - similarity_matrix / max_similarity
    else:
        # No row has any truthy cell: treat every pair as maximally distant.
        dissimilarity_matrix = np.ones_like(similarity_matrix)
    # Every row is at distance zero from itself.
    np.fill_diagonal(dissimilarity_matrix, 0)

    # Hierarchical clustering. linkage() reads a 2-D array as observation
    # vectors, not as a distance matrix, so pass the condensed form: the
    # upper triangle in row-major order.
    condensed = dissimilarity_matrix[np.triu_indices(n_characters, k=1)]
    link = linkage(condensed, method='complete')
    clusters = fcluster(link, 2, criterion='maxclust')

    # Dimensionality reduction for 2D visualization
    embedding = MDS(
        n_components=2,
        dissimilarity='precomputed',
        random_state=42,
        normalized_stress=False,
    )
    transformed = embedding.fit_transform(dissimilarity_matrix)

    # Plotly visualization: one labelled marker per row of df, coloured by
    # cluster id.
    fig = px.scatter(
        x=transformed[:, 0],
        y=transformed[:, 1],
        color=clusters,
        text=df.index,
    )
    fig.update_traces(
        marker=dict(size=15, opacity=0.8, line=dict(width=2, color='DarkSlateGrey')),
        textposition='top center',
        selector=dict(mode='markers+text'),
    )

    output_filepath = os.path.join(output_directory, output_filename)
    fig.write_html(output_filepath)

# Print message when all files are processed
print("All files processed and saved to", output_directory)