thezfz
diff --git a/Containerfile b/Containerfile
Lines changed: 5 additions & 12 deletions
diff --git a/aigraphx/repositories/neo4j_repo.py b/aigraphx/repositories/neo4j_repo.py
Lines changed: 17 additions & 1 deletion
diff --git a/compose.yml b/compose.yml
Lines changed: 1 addition & 1 deletion
diff --git a/data/faiss_index.bin b/data/faiss_index.bin
1.64 MB
diff --git a/data/models_faiss.index b/data/models_faiss.index
7.39 MB
diff --git a/data/models_faiss_ids.json b/data/models_faiss_ids.json
Lines changed: 1 addition & 1 deletion
diff --git a/data/papers_faiss_ids.json b/data/papers_faiss_ids.json
Lines changed: 1 addition & 1 deletion
diff --git a/data/pg_load_checkpoint.txt b/data/pg_load_checkpoint.txt
Lines changed: 1 addition & 1 deletion
diff --git a/scripts/load_postgres.py b/scripts/load_postgres.py
Lines changed: 79 additions & 43 deletions
@@ -32,18 +32,11 @@ WORKDIR /app
 # Copy environment definition file first
 COPY environment.yml environment.yml
 
-# Create Conda environment from environment.yml (excluding pip for now)
-RUN echo "Creating Conda environment AIGraphX (conda parts)..." && \
-    conda env create -f environment.yml --name AIGraphX 
-
-# Extract pip requirements to a temporary file
-RUN echo "Extracting pip requirements..." && \
-    grep -A 100 -- '- pip:' environment.yml | tail -n +2 | sed 's/^- *//' | sed 's/"//g' > /tmp/requirements_pip.txt
-
-# Install pip dependencies using the temporary file inside the Conda env
-RUN echo "Installing pip dependencies..." && \
-    conda run -n AIGraphX pip install --no-cache-dir -r /tmp/requirements_pip.txt && \
-    rm /tmp/requirements_pip.txt # Clean up temporary file
+# Create Conda environment from environment.yml (Handles both Conda and Pip deps)
+RUN echo "Creating Conda environment AIGraphX from environment.yml..." && \
+    conda env create -f environment.yml --name AIGraphX && \
+    # Clean up conda caches after creating environment
+    conda clean -afy
 
 # --- Verification Step (depends on environment install) ---
 RUN echo "Verifying uvicorn installation in AIGraphX environment..." && \
 
@@ -81,7 +81,7 @@ async def _execute_query(
             logger.error("Neo4j driver not available or invalid in _execute_query")
             raise ConnectionError("Neo4j driver is not available.")
 
-        async with self.driver.session() as session:
+        async with self.driver.session(database=self.db_name) as session:
             try:
                 # Use execute_write for automatic transaction management
                 await session.execute_write(lambda tx: tx.run(query, parameters))
@@ -95,6 +95,22 @@ async def _execute_query(
                 logger.error(f"Parameters: {parameters}")
                 raise  # Re-raise to indicate failure
 
+    async def reset_database(self) -> None:
+        """Clears all nodes and relationships from the Neo4j database."""
+        if not self.driver or not hasattr(self.driver, "session"):  # Basic check
+            logger.error("Neo4j driver not available or invalid in reset_database")
+            raise ConnectionError("Neo4j driver is not available.")
+
+        logger.warning("Executing query to delete all nodes and relationships...")
+        query = "MATCH (n) DETACH DELETE n"
+        try:
+            # Use _execute_query to handle the transaction
+            await self._execute_query(query)
+            logger.info("Successfully cleared the Neo4j database.")
+        except Exception as e:
+            logger.error(f"Failed to clear Neo4j database: {e}")
+            raise  # Re-raise after logging
+
     async def create_or_update_paper_node(
         self, pwc_id: str, title: Optional[str] = None
     ) -> None:
 
@@ -51,7 +51,7 @@ services:
     volumes:
       - .:/app:z # ':z' is important for SELinux systems like Fedora
       # Mount data and logs directories as named volumes
-      - aigraphx_data:/app/data
+      - ./data:/app/data
       - aigraphx_logs:/app/logs
     # Define dependencies: The app service depends on the database services
     depends_on:
 
@@ -1 +1 @@
-3429
+5044
@@ -295,8 +295,9 @@ async def insert_pwc_relation(
     """Inserts related items (tasks, datasets) for a paper."""
     if not items:
         return
-    table_name = f"pwc_{relation_type}"  # e.g., pwc_tasks, pwc_datasets
-    column_name = f"{relation_type[:-1]}_name"  # e.g., task_name, dataset_name
+    # Correctly generate table and column names
+    table_name = f"pwc_{relation_type}s" # e.g., pwc_tasks, pwc_datasets
+    column_name = f"{relation_type}_name" # e.g., task_name, dataset_name
 
     # Prepare data tuples: (paper_id, item_name)
     data_tuples = [(paper_id, item) for item in items]
@@ -372,69 +373,104 @@ async def insert_model_paper_link(
 async def process_batch(
     conn: psycopg.AsyncConnection, batch: List[Tuple[int, Dict[str, Any]]]
 ) -> int:
-    """Processes a batch of model data within a single transaction."""
+    """Processes a batch of records within a single transaction."""
     processed_in_batch = 0
-    # Start a transaction for the batch
+    successful_lines_in_batch = 0 # Track successful lines within the batch
     async with conn.transaction():
-        for line_num, model_record in batch:
-            hf_model_id = model_record.get("hf_model_id")
-            if not hf_model_id:
-                logger.warning(f"Skipping line {line_num}: Missing 'hf_model_id'.")
-                continue
-
+        logger.debug(f"Starting transaction for batch of {len(batch)} records.")
+        for line_num, record in batch:
             try:
-                # 1. Insert/Update HF Model
-                await insert_hf_model(conn, model_record)
+                # Assume record processing starts successfully unless exception occurs
+                record_processed_successfully = True
+
+                # --- Process Hugging Face Model ---
+                hf_model_id = record.get("hf_model_id")
+                if hf_model_id:
+                    await insert_hf_model(conn, record)
+                else:
+                    logger.warning(f"Record on line {line_num} missing hf_model_id.")
+                    record_processed_successfully = False # Mark as unsuccessful if critical ID missing
+                    # continue # Optionally skip further processing for this record
 
-                # 2. Process linked papers
-                linked_papers = model_record.get("linked_papers", [])
+                # --- Process Linked Papers (Iterate through the list) ---
+                linked_papers = record.get("linked_papers", [])
                 if not isinstance(linked_papers, list):
-                    logger.warning(
-                        f"Skipping papers for model {hf_model_id} on line {line_num}: 'linked_papers' is not a list."
-                    )
-                    linked_papers = []
+                    logger.warning(f"Record on line {line_num}: 'linked_papers' is not a list. Skipping paper processing.")
+                    linked_papers = [] # Treat as empty list
+
+                # Use a flag to track if *any* paper was successfully processed for this model record
+                at_least_one_paper_processed = False
 
                 for paper_data in linked_papers:
                     if not isinstance(paper_data, dict):
-                        logger.warning(
-                            f"Skipping invalid paper entry for model {hf_model_id} on line {line_num}: not a dictionary."
-                        )
-                        continue
+                        logger.warning(f"Skipping invalid paper entry for model {hf_model_id} on line {line_num}: not a dictionary.")
+                        continue # Skip this invalid paper entry
 
-                    # 3. Get or Insert Paper
+                    # --- Process Single Paper (Get or Insert) ---
+                    # Pass the individual paper_data dictionary here
                     paper_id = await get_or_insert_paper(conn, paper_data)
 
+                    # --- Link Model and Paper (only if both exist) ---
+                    if hf_model_id and paper_id:
+                        await insert_model_paper_link(conn, hf_model_id, paper_id)
+                        at_least_one_paper_processed = True # Mark success if linked
+                    # Log cases where linking didn't happen (optional)
+                    # elif paper_id and not hf_model_id:
+                    #     logger.debug(f"Paper on line {line_num} processed but no HF model ID.")
+                    # elif hf_model_id and not paper_id:
+                    #     logger.debug(f"HF model {hf_model_id} on line {line_num} processed but paper insertion failed.")
+
+                    # --- Process PWC Relations and Repositories (only if paper was successfully inserted/found) ---
                     if paper_id:
-                        # 4. Link Model to Paper (Call updated function)
-                        await insert_model_paper_link(
-                            conn,
-                            hf_model_id,
-                            paper_id,
-                        )
-
-                        # 5. Insert PWC Relations (Tasks, Datasets, Repos)
+                        # IMPORTANT: Get pwc_entry from paper_data, not the top-level record
                         pwc_entry = paper_data.get("pwc_entry") or {}
                         await insert_pwc_relation(
-                            conn, paper_id, "tasks", pwc_entry.get("tasks")
+                            conn, paper_id, "task", pwc_entry.get("tasks")
+                        )
+                        await insert_pwc_relation(
+                            conn, paper_id, "method", pwc_entry.get("methods")
                         )
                         await insert_pwc_relation(
-                            conn, paper_id, "datasets", pwc_entry.get("datasets")
+                            conn,
+                            paper_id,
+                            "dataset",
+                            pwc_entry.get("datasets_used"), # Assuming field name
                         )
-                        # await insert_pwc_relation(conn, paper_id, "methods", pwc_entry.get("methods")) # Uncomment if methods are added
                         await insert_pwc_repositories(
                             conn, paper_id, pwc_entry.get("repositories")
                         )
+                        at_least_one_paper_processed = True # Also mark success here
+                    else:
+                        # If get_or_insert_paper returned None, the paper processing failed for this entry
+                        logger.warning(f"Failed to get or insert paper for entry in linked_papers on line {line_num}. Paper data: {paper_data}")
+                        # Consider if this failure should mark the whole model record as failed
+                        # record_processed_successfully = False
+
+                # Increment the main counter only if the model record itself was deemed successful
+                # (e.g., hf_model_id was present, and potentially if at least one paper linked if required)
+                if record_processed_successfully: # Adjust this condition based on requirements
+                    processed_in_batch += 1
+                    successful_lines_in_batch += 1 # Increment success counter
+                    logger.debug(f"Successfully processed record from line {line_num} (including linked papers if any).")
+                else:
+                    logger.warning(f"Marked record from line {line_num} as processed with errors/skips.")
+                    # Even if marked as error, we might count it towards the total lines *attempted* in the batch
+                    processed_in_batch += 1
 
-                processed_in_batch += 1
             except Exception as e:
-                logger.error(
-                    f"Error processing record for model {hf_model_id} on line {line_num}: {e}"
-                )
-                logger.error(traceback.format_exc())
-                # Decide whether to continue batch or raise error to rollback transaction
-                # For robustness, log error and continue processing other records in batch
-
-    return processed_in_batch
+                # Log detailed error including traceback and the problematic record line number
+                tb_str = traceback.format_exc()
+                logger.error(f"Error processing record from line {line_num}: {e}\\nRecord: {record}\\nTraceback:\\n{tb_str}")
+                # Re-raise the exception to trigger the automatic rollback of conn.transaction()
+                raise # This will rollback the *entire* batch
+
+    # If the 'async with conn.transaction()' block completes without exceptions, commit is automatic.
+    # If an exception occurs, rollback is automatic.
+    logger.debug(f"Transaction for batch completed (Commit or Rollback occurred). Successfully processed {successful_lines_in_batch} lines in this attempt.")
+    # Return the count of records successfully processed within the committed transaction.
+    # This point is only reached after a commit: any exception is re-raised above, which rolls back the whole batch and propagates to the caller.
+    # Records skipped for a missing hf_model_id are counted in processed_in_batch but not in successful_lines_in_batch.
+    return successful_lines_in_batch
 
 
 async def main(input_file_path: str, reset_db: bool, reset_checkpoint: bool) -> None: