-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare.py
More file actions
105 lines (71 loc) · 4.07 KB
/
prepare.py
File metadata and controls
105 lines (71 loc) · 4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
import pandas as pd
import yaml
import csv # Required for proper quoting in CSV
from src.config import PROJECT_ROOT
from src.log import log_action
# File extensions treated as video when building files.csv (compared lowercase).
_VIDEO_FILE_TYPES = (".mp4", ".mov", ".avi", ".mkv", ".flv", ".wmv", ".mpeg", ".mpg")


def _join_codes(values):
    """Return the distinct, non-null codes in *values* joined with "|".

    Values are converted to ``str`` so numeric codes do not break the
    substring test or the join. Codes containing "Interviews" are dropped.
    First-seen order is kept (instead of going through a ``set``) so the
    result is the same on every run.
    """
    codes = (str(value) for value in values if pd.notnull(value))
    kept = dict.fromkeys(code for code in codes if "Interviews" not in code)
    return "|".join(kept)


def process_csv():
    """Generate items.csv, files.csv and metadata.csv from the source CSVs.

    Reads ``settings.yml`` (opened relative to the current working directory)
    to find the source directory under ``PROJECT_ROOT``, loads ``source.csv``
    and ``artists.csv`` from it, and writes three files into that directory:

    * ``items.csv`` (no header row): one row per ``parent`` with its ``title``
      and the ``name`` column of the merged frame (presumably the artist name
      from artists.csv -- confirm against that file's columns). Artists are
      matched on the first four characters of ``parent``.
    * ``files.csv``: unique ``parent``/``code`` pairs, excluding video files.
    * ``metadata.csv``: one row per ``parent`` with all ``code_*`` values
      merged into a single "|"-separated ``codes`` column.

    Parents without any remaining row in files.csv are removed from
    items.csv and metadata.csv. Each written file is reported through
    ``log_action`` and ``print``.
    """
    # Load settings
    with open("settings.yml", "r", encoding="utf-8") as file:
        settings = yaml.safe_load(file)

    source = settings["source"]["dir"]
    source_dir = os.path.join(PROJECT_ROOT, source)
    source_file = os.path.join(source_dir, "source.csv")
    artists_file = os.path.join(source_dir, "artists.csv")  # Same location as source.csv

    # Read the identifier columns as text: letting pandas infer a numeric
    # dtype would strip leading zeros (or add ".0") before the prefix match.
    df = pd.read_csv(source_file, dtype={"parent": str})
    artists_df = pd.read_csv(artists_file, dtype={"Artist Code": str})

    # Ensure Artist Code is a 4-character string (preserve leading zeros)
    artists_df["Artist Code"] = artists_df["Artist Code"].astype(str).str.zfill(4)

    # The first 4 characters of parent identify the artist
    df["artist_code"] = df["parent"].astype(str).str[:4]

    # Left merge keeps every source row, even when no artist matches
    merged_df = df.merge(artists_df, left_on="artist_code", right_on="Artist Code", how="left")

    # Create items.csv (unique parent, title and name)
    items_df = merged_df[["parent", "title", "name"]].drop_duplicates()

    # One row per parent: sorting with missing titles last makes
    # keep="first" prefer a row that actually has a title.
    items_df = items_df.sort_values(by="title", na_position="last").drop_duplicates(
        subset=["parent"], keep="first"
    )

    # Create files.csv (parent, code)
    files_df = df[["parent", "code"]].drop_duplicates()

    #### Either keep or remove video file types ####
    # Lowercase before matching so ".MP4" is recognised too; na=False treats a
    # missing code as "not a video" instead of producing NaN, which would make
    # the boolean negation below raise.
    is_video = files_df["code"].str.lower().str.endswith(_VIDEO_FILE_TYPES, na=False)

    # Filter out video file types (e.g., .mp4, .mov, .avi)
    files_df = files_df[~is_video]

    # To include only video file types instead, use:
    # files_df = files_df[is_video]
    #################################################

    # Identify all columns that start with 'code_' (excluding 'code')
    code_columns = [col for col in df.columns if col.startswith("code_")]

    # Create metadata.csv: first non-null value of each code column per parent
    metadata_df = df.groupby(["parent"])[code_columns].first().reset_index()

    # Keep only items and metadata whose parent still has a file
    remaining_parents = files_df["parent"]
    items_df = items_df[items_df["parent"].isin(remaining_parents)]
    metadata_df = metadata_df[metadata_df["parent"].isin(remaining_parents)]

    # Collapse the code columns of every row into one "|"-separated string.
    # Iterating plain row lists also works when there are no rows or no
    # code_* columns, where a row-wise DataFrame.apply would not return a Series.
    codes = [_join_codes(row) for row in metadata_df[code_columns].to_numpy().tolist()]

    # Replace the original code columns with the combined column
    metadata_df = metadata_df.drop(columns=code_columns)
    metadata_df["codes"] = codes

    print(f"{metadata_df}")

    # Save items.csv WITHOUT headers
    items_csv_path = os.path.join(source_dir, "items.csv")
    items_df.to_csv(items_csv_path, index=False, header=False, quoting=csv.QUOTE_MINIMAL)
    log_action(f"{items_csv_path} generated successfully.", "success")
    print("✅ items.csv generated successfully (without headers)")

    files_csv_path = os.path.join(source_dir, "files.csv")
    files_df.to_csv(files_csv_path, index=False, quoting=csv.QUOTE_MINIMAL)
    log_action(f"{files_csv_path} generated successfully.", "success")
    print("✅ files.csv generated successfully!")

    metadata_csv_path = os.path.join(source_dir, "metadata.csv")
    metadata_df.to_csv(metadata_csv_path, index=False, quoting=csv.QUOTE_MINIMAL)
    log_action(f"{metadata_csv_path} generated successfully.", "success")
    print("✅ metadata.csv generated successfully!")
# Run the CSV preparation only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    process_csv()