jlcpcb-parts-database/generate-database.py at main · CDFER/jlcpcb-parts-database · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import pandas as pd
import sqlite3
import os
from datetime import datetime, timezone

os.chdir("db_build")

# Rename the database file
os.rename("cache.sqlite3", f"jlcpcb-components.sqlite3")

initial_db_size = os.path.getsize("jlcpcb-components.sqlite3")
print(f"Initial SQLite Database Size: {initial_db_size / (1024 ** 3):.2f} GiB")

conn = sqlite3.connect("jlcpcb-components.sqlite3")
cur = conn.cursor()

cur.execute("PRAGMA journal_mode = WAL")  # Enable Write-Ahead Logging (WAL) for improved performance and concurrency
cur.execute("PRAGMA synchronous = NORMAL")  # Set the synchronous mode to NORMAL, which balances safety and performance
cur.execute("PRAGMA temp_store = MEMORY")  # Store temporary tables and indices in memory for faster access
cur.execute("PRAGMA mmap_size = 536870912")  # Set the maximum memory map size to 512MiB

# Delete components with low stock
cur.execute("DELETE FROM components WHERE stock < 5;")
conn.commit()
print(f"Deleted {cur.rowcount} components with low stock")

# Create an FTS (Full-Text Search) index on multiple columns (helps to speed up searching the database)
cur.execute(
    """
    CREATE VIRTUAL TABLE components_fts USING fts5(
        lcsc,
        mfr,
        package,
        description,
        datasheet,
        content='components'
    );
"""
)
conn.commit()

# Reindex database to reduce file size
cur.execute("REINDEX;")
conn.commit()

# Vacuum database to reduce file size
cur.execute("VACUUM;")
conn.commit()

# Load Scraped Components List
file_location = os.path.join("..", os.path.join("scraped", "ComponentList.csv"))
df = pd.read_csv(file_location)

# Convert date columns to datetime with UTC timezone
df["First Seen"] = pd.to_datetime(df["First Seen"], format="%Y/%m/%d", utc=True)
df["Last Seen"] = pd.to_datetime(df["Last Seen"], format="%Y/%m/%d", utc=True)

# Calculate time differences
now = datetime.now(timezone.utc)
df["Days Since First Seen"] = (now - df["First Seen"]).dt.days
df["Days Since Last Seen"] = (now - df["Last Seen"]).dt.days

# Filter components
component_codes = df[(df["Days Since First Seen"] >= 1) & (df["Days Since Last Seen"] < 2)]["lcsc"].astype(int).tolist()

preferred_parts_corrected = 0
for code in component_codes:
    cur.execute("SELECT 1 FROM components WHERE lcsc = ?", (code,))
    if cur.fetchone():
        cur.execute(
            "UPDATE components SET preferred = 1 WHERE lcsc = ? AND basic = 0 AND preferred = 0",
            (code,),
        )
        conn.commit()
        preferred_parts_corrected += 1

print(f"Preferred Parts Corrected: {preferred_parts_corrected}")

optimized_db_size = os.path.getsize("jlcpcb-components.sqlite3")
print(f"Optimized Database Size: {optimized_db_size / (1024 ** 3):.2f} GiB")

# Retrieve basic/preferred components ($0 for loading feeders) and exclude "0201" package
cur.execute(
    """
    SELECT * FROM v_components
    WHERE (basic > 0 OR preferred > 0) AND package != '0201';
"""
)
filtered_components = cur.fetchall()

# Create Pandas DataFrame
df_sorted = pd.DataFrame(filtered_components, columns=[desc[0] for desc in cur.description])

# Merge assembly details
file_location = os.path.join("..", os.path.join("scraped", "assembly-details.csv"))
df = pd.read_csv(file_location)

df_filtered = df[df["lcsc"].isin(df_sorted["lcsc"])]

df_sorted = pd.merge(
    df_sorted, df_filtered[["lcsc", "Assembly Process", "Min Order Qty", "Attrition Qty"]], on="lcsc", how="right"
)

df_sorted = df_sorted.sort_values(by=["category", "subcategory", "package"])

# Remove parts with missing price fields
df_sorted = df_sorted.drop(df_sorted[df_sorted["price"] == "[]"].index)

# Save sorted DataFrame to CSV
df_sorted.to_csv("jlcpcb-components-basic-preferred.csv", index=False, header=True)

cur.execute("PRAGMA analyze")  # Update statistics for the query planner to improve query performance
cur.execute("PRAGMA optimize")  # Perform various optimizations, such as reindexing and refreshing views
conn.close()