-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
49 lines (39 loc) · 1.32 KB
/
preprocessing.py
File metadata and controls
49 lines (39 loc) · 1.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pandas as pd
import json
# Path of the curated items CSV that this script reads as its input.
items_file = "data/5k_items_curated.csv"
def build_image_url(image_str: str) -> str:
    """Return the full image URL for an image path fragment.

    Args:
        image_str: Image path fragment. It is appended verbatim to the fixed
            base URL; no escaping or normalisation is applied.

    Returns:
        ``https://static.ifood-static.com.br/image/upload/t_low/pratos/``
        followed by ``image_str``.
    """
    return f"https://static.ifood-static.com.br/image/upload/t_low/pratos/{image_str}"
# Column order of the cleaned CSV. Passing it to the DataFrame constructor keeps
# the header row stable even when no input row could be cleaned.
_CLEANED_COLUMNS = [
    "item_id",
    "name",
    "category_name",
    "description",
    "taxonomy_levels",
    "search_key",
    "images",
]


def _clean_item(item_id, raw_metadata):
    """Flatten one raw item row into the record written to the cleaned CSV.

    Args:
        item_id: Value of the row's ``itemId`` column; copied through unchanged.
        raw_metadata: Value of the row's ``itemMetadata`` column, expected to
            be a JSON object encoded as a string.

    Returns:
        A dict with the keys ``item_id``, ``name``, ``category_name``,
        ``description``, ``taxonomy_levels``, ``search_key`` and ``images``,
        or ``None`` when ``raw_metadata`` cannot be parsed into a JSON object.
    """
    try:
        metadata = json.loads(raw_metadata)
    except (TypeError, ValueError):
        # TypeError: the cell is not text (e.g. an empty cell read as NaN).
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        return None
    if not isinstance(metadata, dict):
        # Valid JSON, but not an object: there are no fields to extract.
        return None

    # `or` maps an explicit JSON null onto the same default as a missing key.
    name = metadata.get("name") or ""
    category_name = metadata.get("category_name") or ""
    description = metadata.get("description") or ""

    taxonomy = metadata.get("taxonomy")
    if not isinstance(taxonomy, dict):
        taxonomy = {}
    taxonomy_levels = [taxonomy.get(level) or "" for level in ("l0", "l1", "l2")]

    images = [build_image_url(image) for image in metadata.get("images") or []]

    # Search key: every non-empty text field joined by single spaces.
    search_key = " ".join(
        part
        for part in (name, category_name, description, *taxonomy_levels)
        if part
    )

    return {
        "item_id": item_id,
        "name": name,
        "category_name": category_name,
        "description": description,
        "taxonomy_levels": taxonomy_levels,
        "search_key": search_key,
        "images": images,
    }


df = pd.read_csv(items_file)

cleaned_items = []
skipped_rows = 0
# Only two columns are needed, so iterate them directly instead of building a
# Series per row with iterrows().
for item_id, raw_metadata in zip(df["itemId"], df["itemMetadata"]):
    cleaned = _clean_item(item_id, raw_metadata)
    if cleaned is None:
        skipped_rows += 1
        continue
    cleaned_items.append(cleaned)

# Rows with unusable metadata are still skipped, but no longer silently.
if skipped_rows:
    print(f"Skipped {skipped_rows} of {len(df)} rows with unusable itemMetadata")

cleaned_df = pd.DataFrame(cleaned_items, columns=_CLEANED_COLUMNS)
cleaned_df.to_csv("data/5k_items_cleaned.csv", index=False)