|
19 | 19 |
|
# Automation of import to sqlite3 database
def auto_import_uploads():
    """
    Import CSV/NPY datasets from uploads/<property>/<dataset>/ into SQLite tables.

    - Skips music under uploads/clips/ (the subtree is pruned from the walk,
      so os.walk never descends into it).
    - Does NOT write to uploads_log (logging handled elsewhere).
    - Re-imports only when source file mtime changed (tracked in import_etag).

    Returns:
        int: number of tables created or refreshed during this run.
    """
    if not os.path.exists(UPLOAD_FOLDER):
        print("auto_import_uploads: uploads/ folder not found, skipping.")
        return 0

    ALLOWED_IMPORT_EXTS = {'csv', 'npy'}
    imported = 0

    def tableize(name: str) -> str:
        # Stable, safe table name from filename only (not full path)
        # e.g. "bandgap.csv" -> "bandgap_csv"
        t = name.replace('.', '_').replace('-', '_').replace(' ', '_')
        return re.sub(r'[^0-9a-zA-Z_]', '_', t)

    with sqlite3.connect(DB_NAME) as conn:
        c = conn.cursor()
        # Track file mtimes to avoid unnecessary re-imports
        c.execute("""
            CREATE TABLE IF NOT EXISTS import_etag (
                relpath TEXT PRIMARY KEY,
                mtime REAL NOT NULL
            )
        """)
        conn.commit()

        for root, dirs, files in os.walk(UPLOAD_FOLDER):
            # Prune the music tree in place (topdown walk honours dirs[:])
            # instead of filtering after os.walk has already descended into it.
            if root == UPLOAD_FOLDER and 'clips' in dirs:
                dirs.remove('clips')

            for filename in files:
                if filename.startswith('.'):
                    continue  # hidden/system files (.DS_Store, etc.)
                ext = filename.rsplit('.', 1)[-1].lower()
                if ext not in ALLOWED_IMPORT_EXTS:
                    continue

                filepath = os.path.join(root, filename)
                relpath = os.path.relpath(filepath, UPLOAD_FOLDER)
                try:
                    mtime = os.path.getmtime(filepath)
                except OSError:
                    # File vanished between walk() and stat(); skip this run.
                    continue
                table_name = tableize(filename)

                # Check etag (mtime)
                c.execute("SELECT mtime FROM import_etag WHERE relpath=?", (relpath,))
                row = c.fetchone()
                if row and float(row[0]) == float(mtime):
                    # up-to-date, skip
                    continue

                # Load into DataFrame
                try:
                    if ext == 'csv':
                        df = pd.read_csv(filepath)
                    else:  # npy
                        arr = np.load(filepath, allow_pickle=True)
                        if not isinstance(arr, np.ndarray):
                            print(f"auto_import_uploads: unsupported NPY structure for {relpath}, skipping.")
                            continue
                        if arr.ndim == 1 and arr.dtype.names:
                            # structured array -> DataFrame with named columns
                            df = pd.DataFrame(arr.tolist(), columns=list(arr.dtype.names))
                        else:
                            # plain 1-D/2-D array -> positional columns
                            df = pd.DataFrame(arr)
                except Exception as e:
                    print(f"auto_import_uploads: failed to read {relpath}: {e}")
                    continue

                # Import into SQLite (replace whole table)
                try:
                    df.to_sql(table_name, conn, if_exists='replace', index=False)
                    c.execute("REPLACE INTO import_etag (relpath, mtime) VALUES (?, ?)", (relpath, mtime))
                    conn.commit()
                    imported += 1
                    print(f"auto_import_uploads: imported {relpath} -> table '{table_name}'")
                except Exception as e:
                    print(f"auto_import_uploads: failed to import {relpath} to '{table_name}': {e}")

    print(f"auto_import_uploads: done, {imported} table(s) updated.")
    return imported
79 | 110 |
|
80 | 111 | def auto_log_material_files(): |
81 | 112 | if not os.path.exists(UPLOAD_FOLDER): |
82 | 113 | return |
83 | 114 |
|
84 | | - all_allowed_exts = ALLOWED_DATASET_EXTENSIONS | ALLOWED_RESULTS_EXTENSIONS | ALLOWED_MUSIC_EXTENSIONS |
| 115 | + all_allowed_exts = ALLOWED_DATASET_EXTENSIONS | ALLOWED_RESULTS_EXTENSIONS |
| 116 | + |
| 117 | + with sqlite3.connect(DB_NAME) as conn: |
| 118 | + c = conn.cursor() |
| 119 | + # Ensure uniqueness constraint exists |
| 120 | + c.execute(""" |
| 121 | + CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_uploads |
| 122 | + ON uploads_log(property, tab, filename) |
| 123 | + """) |
| 124 | + conn.commit() |
85 | 125 |
|
86 | 126 | for root, dirs, files in os.walk(UPLOAD_FOLDER): |
87 | 127 | for filename in files: |
|
0 commit comments