-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenerate-database.py
More file actions
114 lines (89 loc) · 3.88 KB
/
generate-database.py
File metadata and controls
114 lines (89 loc) · 3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import pandas as pd
import sqlite3
import os
from datetime import datetime, timezone
os.chdir("db_build")
# Rename the database file
os.rename("cache.sqlite3", f"jlcpcb-components.sqlite3")
initial_db_size = os.path.getsize("jlcpcb-components.sqlite3")
print(f"Initial SQLite Database Size: {initial_db_size / (1024 ** 3):.2f} GiB")
conn = sqlite3.connect("jlcpcb-components.sqlite3")
cur = conn.cursor()
cur.execute("PRAGMA journal_mode = WAL") # Enable Write-Ahead Logging (WAL) for improved performance and concurrency
cur.execute("PRAGMA synchronous = NORMAL") # Set the synchronous mode to NORMAL, which balances safety and performance
cur.execute("PRAGMA temp_store = MEMORY") # Store temporary tables and indices in memory for faster access
cur.execute("PRAGMA mmap_size = 536870912") # Set the maximum memory map size to 512MiB
# Delete components with low stock
cur.execute("DELETE FROM components WHERE stock < 5;")
conn.commit()
print(f"Deleted {cur.rowcount} components with low stock")
# Create an FTS (Full-Text Search) index on multiple columns (helps to speed up searching the database)
cur.execute(
"""
CREATE VIRTUAL TABLE components_fts USING fts5(
lcsc,
mfr,
package,
description,
datasheet,
content='components'
);
"""
)
conn.commit()
# Reindex database to reduce file size
cur.execute("REINDEX;")
conn.commit()
# Vacuum database to reduce file size
cur.execute("VACUUM;")
conn.commit()
# Load Scraped Components List
file_location = os.path.join("..", os.path.join("scraped", "ComponentList.csv"))
df = pd.read_csv(file_location)
# Convert date columns to datetime with UTC timezone
df["First Seen"] = pd.to_datetime(df["First Seen"], format="%Y/%m/%d", utc=True)
df["Last Seen"] = pd.to_datetime(df["Last Seen"], format="%Y/%m/%d", utc=True)
# Calculate time differences
now = datetime.now(timezone.utc)
df["Days Since First Seen"] = (now - df["First Seen"]).dt.days
df["Days Since Last Seen"] = (now - df["Last Seen"]).dt.days
# Filter components
component_codes = df[(df["Days Since First Seen"] >= 1) & (df["Days Since Last Seen"] < 2)]["lcsc"].astype(int).tolist()
preferred_parts_corrected = 0
for code in component_codes:
cur.execute("SELECT 1 FROM components WHERE lcsc = ?", (code,))
if cur.fetchone():
cur.execute(
"UPDATE components SET preferred = 1 WHERE lcsc = ? AND basic = 0 AND preferred = 0",
(code,),
)
conn.commit()
preferred_parts_corrected += 1
print(f"Preferred Parts Corrected: {preferred_parts_corrected}")
optimized_db_size = os.path.getsize("jlcpcb-components.sqlite3")
print(f"Optimized Database Size: {optimized_db_size / (1024 ** 3):.2f} GiB")
# Retrieve basic/preferred components ($0 for loading feeders) and exclude "0201" package
cur.execute(
"""
SELECT * FROM v_components
WHERE (basic > 0 OR preferred > 0) AND package != '0201';
"""
)
filtered_components = cur.fetchall()
# Create Pandas DataFrame
df_sorted = pd.DataFrame(filtered_components, columns=[desc[0] for desc in cur.description])
# Merge assembly details
file_location = os.path.join("..", os.path.join("scraped", "assembly-details.csv"))
df = pd.read_csv(file_location)
df_filtered = df[df["lcsc"].isin(df_sorted["lcsc"])]
df_sorted = pd.merge(
df_sorted, df_filtered[["lcsc", "Assembly Process", "Min Order Qty", "Attrition Qty"]], on="lcsc", how="right"
)
df_sorted = df_sorted.sort_values(by=["category", "subcategory", "package"])
# Remove parts with missing price fields
df_sorted = df_sorted.drop(df_sorted[df_sorted["price"] == "[]"].index)
# Save sorted DataFrame to CSV
df_sorted.to_csv("jlcpcb-components-basic-preferred.csv", index=False, header=True)
cur.execute("PRAGMA analyze") # Update statistics for the query planner to improve query performance
cur.execute("PRAGMA optimize") # Perform various optimizations, such as reindexing and refreshing views
conn.close()