5 changes: 3 additions & 2 deletions .gitignore
@@ -25,6 +25,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
.DS_Store

# PyInstaller
# Usually these files are written by a python script from a template
@@ -98,7 +99,7 @@ ipython_config.py
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
@@ -173,7 +174,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
45 changes: 45 additions & 0 deletions baseline_quest/README.md
@@ -0,0 +1,45 @@
# Baseline Experiments with QUEST

## Experiment Setup
1. Indexed with Chroma (`index_documents.py`); a minimal indexing sketch follows this list
   - Chunks of 512 tokens with an 80-token overlap, or index only the first 512 tokens of each document (same as QUEST)
   - Embedded in batches of 256 chunks
   - Embedding model: `bge-small-en-v1.5`
   - Note: `index_documents.py` creates the collection if the collection specified in `config.yaml` does not exist.

2. Decompose (optional)
   - Use `gpt-4o-mini` to decompose the query into subqueries connected with set operators (`|` for union, `&` for intersection)
   - E.g. "Stoloniferous plants or crops originating from Bolivia" -> retrieve("crops from Bolivia", 100) | retrieve("stoloniferous plants", 100)
   - E.g. "Neogene mammals of Africa that are Odd-toed ungulates" -> retrieve("Neogene mammals of Africa", 100) & retrieve("Odd-toed ungulates", 100)
   - Example decomposition Python files: `examples/query_9.py` and `examples/query_10.py`
   - Generate decompositions with `decompose_retrieve.py --mode generate`

3. Retrieval @ k (see the retrieval sketch after this list)
   - Retrieve with the query / subqueries (vector similarity)
     - If the entire document was indexed, retrieve the top 200 most similar chunks, then map each chunk back to its parent document.
     - If only the first 512 tokens were indexed, retrieve the top k most similar chunks.
     - `semantic_retrieval.py`
   - Retrieve after decomposition with the query / subqueries
     - `decompose_retrieve.py --mode execute` executes all of the decomposition Python scripts generated in step 2 and uses the same vector-similarity retrieval for each subquery.

4. Calculate Recall@K
   - For semantic retrieval, this step is done by `semantic_retrieval.py`.
   - For decompose-and-retrieve, run `decompose_retrieve.py --mode analyze`.
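
A minimal sketch of the chunk-level indexing in step 1. `index_documents.py` is not shown in this diff, so the structure and names below are assumptions; the sketch assumes `chromadb`, `transformers`, and `sentence-transformers` are installed and mirrors the `chunk_size`, `overlap`, and `embedding_model` settings from `config.yaml`.

```python
import chromadb
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

MODEL_NAME = "BAAI/bge-small-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
embedder = SentenceTransformer(MODEL_NAME)

client = chromadb.PersistentClient(path="./chroma_quest")
collection = client.get_or_create_collection("quest_documents")

def chunk_text(text: str, chunk_size: int = 512, overlap: int = 80):
    """Split text into overlapping windows of chunk_size tokens."""
    ids = tokenizer.encode(text, add_special_tokens=False)
    step = chunk_size - overlap
    # Stop before emitting a window that would only repeat the overlap.
    for start in range(0, max(len(ids) - overlap, 1), step):
        yield tokenizer.decode(ids[start:start + chunk_size])

def index_document(title: str, text: str, first_512_only: bool = False):
    # If index_first_512 is set, keep only the first chunk (first 512 tokens).
    chunks = list(chunk_text(text))
    if first_512_only:
        chunks = chunks[:1]
    embeddings = embedder.encode(chunks, batch_size=256, normalize_embeddings=True).tolist()
    collection.add(
        ids=[f"{title}__{i}" for i in range(len(chunks))],
        documents=chunks,
        embeddings=embeddings,
        # Keep the parent title so chunk hits can be mapped back to documents.
        metadatas=[{"title": title} for _ in chunks],
    )
```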

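A minimal sketch of steps 3-4, reusing the `embedder` and `collection` objects from the indexing sketch above. This is not the actual `semantic_retrieval.py`; the `"title"` metadata field is an assumption carried over from that sketch.

```python
def retrieve_docs(query: str, k: int = 100, n_chunks: int = 200) -> list[str]:
    """Vector-similarity retrieval: query chunks, then map hits to documents."""
    query_emb = embedder.encode([query], normalize_embeddings=True).tolist()
    result = collection.query(query_embeddings=query_emb, n_results=n_chunks)
    titles: list[str] = []
    for meta in result["metadatas"][0]:   # hits are ordered by similarity
        title = meta["title"]
        if title not in titles:           # deduplicate: chunk -> document mapping
            titles.append(title)
    return titles[:k]

def recall_at_k(gold_docs: set[str], predicted_docs: list[str]) -> float:
    """Fraction of gold documents present in the prediction list."""
    return len(gold_docs & set(predicted_docs)) / len(gold_docs) if gold_docs else 1.0
```
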
## Data
The data comes directly from QUEST (https://github.com/google-research/language/tree/master/language/quest#examples).
- The embedded documents are from: https://storage.googleapis.com/gresearch/quest/documents.zip
- `data/train_subset1.jsonl` contains 20 randomly sampled queries from QUEST's `train.jsonl`.
- `data/train_subset2.jsonl` contains 20 randomly sampled non-union queries from QUEST's `train.jsonl`.

<!-- ## Retrieval
Retrieval is done with `semantic_retrieval/retrieve.py`.
- For decompose, we are retrieving titles only, and the code for this is written in `decompose/retrieve.py`
- For vector similarity, we are retrieving (title, chunk) tuples (`INCLUDE_CHUNKS = true`) -->

## Results
| | Retrieve (entire document) | Retrieve (first 512 tokens) | Decompose + Retrieve* (entire document) | Decompose + Retrieve* (first 512 tokens) |
|--------------|----------------------------|-----------------------------|-----------------------------------------------------------------|-----------------------------------------------------------------|
| Recall @ 20 | 0.0886 | 0.1127 | - | |
| Recall @ 50 | 0.1663 | 0.1593 | 0.1560 (\|Pred\| = 61.10) | 0.1617 (\|Pred\| = 60.70) |
| Recall @ 100 | 0.2122 | 0.2250 | 0.2285 (\|Pred\| = 205.95) (k for subqueries increased in size) | 0.2157 (\|Pred\| = 209.30) (k for subqueries increased in size) |
22 changes: 22 additions & 0 deletions baseline_quest/config.yaml
@@ -0,0 +1,22 @@
indexing:
  embedding_model: "BAAI/bge-small-en-v1.5"
  index_first_512: True
  chunk_size: 512
  overlap: 80
  batch_size: 512

chroma:
  persist_dir: "./chroma_quest"
  collection: "quest_documents"

data:
  document_path: /orcd/home/002/joycequ/quest_data/documents.jsonl
  queries_file: data/train_subset1.jsonl # gold path

retrieval:
  top_k: 100
  include_chunks: True

decomposition:
  top_k: 100
  llm_model: gpt-4o-mini
20 changes: 20 additions & 0 deletions baseline_quest/data/train_subset1.jsonl

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions baseline_quest/data/train_subset2.jsonl

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions baseline_quest/data/train_subset_nonunion.jsonl

Large diffs are not rendered by default.

92 changes: 92 additions & 0 deletions baseline_quest/decompose_analyze_varying_k.py
@@ -0,0 +1,92 @@
import os
import csv
import argparse
from typing import List, Set

from lib.chroma_utils import read_jsonl

def calculate_recall(gold_docs: Set[str], predicted_docs: List[str]) -> float:
    if not gold_docs:
        return 1.0

    pred_set = set(predicted_docs)
    intersection = gold_docs.intersection(pred_set)
    return len(intersection) / len(gold_docs)

def run_analysis(gold_path: str, pred_path: str, output_path: str, query_index: int):
    # Load gold data (using chroma_utils)
    gold_examples = list(read_jsonl(gold_path))

    # Extract the target query
    target_gold = gold_examples[query_index]
    gold_docs_set = set(target_gold.get("docs", []))
    query_text = target_gold.get("query", "Unknown Query")

    pred_examples = list(read_jsonl(pred_path))

    # Fall back to the current directory when the output path has no directory part.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

    with open(output_path, "w", newline="", encoding="utf-8") as f_csv:
        csv_writer = csv.writer(f_csv)
        # CSV header
        csv_writer.writerow(["query", "k1", "k2", "recall_docs_1", "recall_docs_2", "recall_final"])

        stats = {
            "total_recall_1": 0.0,
            "total_recall_2": 0.0,
            "total_recall_final": 0.0,
            "count": 0
        }

        for pred in pred_examples:
            # Extract parameters
            k1 = pred.get("k1")
            k2 = pred.get("k2")

            if k1 is None or k2 is None:
                continue

            docs_1 = pred.get("docs_1", [])
            docs_2 = pred.get("docs_2", [])
            docs_final = pred.get("docs", [])

            # Calculate recall for each subquery and for the combined result
            r1 = calculate_recall(gold_docs_set, docs_1)
            r2 = calculate_recall(gold_docs_set, docs_2)
            r_final = calculate_recall(gold_docs_set, docs_final)

            # Write row
            csv_writer.writerow([
                query_text,
                k1,
                k2,
                f"{r1:.6f}",
                f"{r2:.6f}",
                f"{r_final:.6f}"
            ])

            # Accumulate stats
            stats["total_recall_1"] += r1
            stats["total_recall_2"] += r2
            stats["total_recall_final"] += r_final
            stats["count"] += 1

    # Report average recall over all (k1, k2) settings
    if stats["count"] > 0:
        n = stats["count"]
        print(f"Rows analyzed: {n}")
        print(f"Avg recall (subquery 1): {stats['total_recall_1'] / n:.4f}")
        print(f"Avg recall (subquery 2): {stats['total_recall_2'] / n:.4f}")
        print(f"Avg recall (final):      {stats['total_recall_final'] / n:.4f}")

def main():
    parser = argparse.ArgumentParser(description="Analyze Recall for varying k parameters.")

    # Default args match your file structure
    parser.add_argument("--gold", type=str, default="../../train_subset.jsonl",
                        help="Path to gold standard JSONL.")
    parser.add_argument("--pred", type=str, default="pred_query_10_varying_k_modified.jsonl",
                        help="Path to predictions JSONL.")
    parser.add_argument("--out", type=str, default="query_10_varying_k_results_modified.csv",
                        help="Output CSV path.")
    parser.add_argument("--index", type=int, default=10,
                        help="Index of the query in the gold file (0-based). Default 10.")

    args = parser.parse_args()

    run_analysis(args.gold, args.pred, args.out, args.index)

if __name__ == "__main__":
    main()
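
The script above reads one prediction record per (k1, k2) setting. A hypothetical record matching the fields `run_analysis` accesses (`k1`, `k2`, `docs_1`, `docs_2`, `docs`); the titles are placeholders, and any additional fields would be ignored:

```python
example_pred = {
    "k1": 100,                          # top-k used for subquery 1
    "k2": 100,                          # top-k used for subquery 2
    "docs_1": ["Title A", "Title B"],   # documents retrieved for subquery 1
    "docs_2": ["Title B", "Title C"],   # documents retrieved for subquery 2
    "docs": ["Title B"],                # final combined prediction
}
```
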
104 changes: 104 additions & 0 deletions baseline_quest/decompose_retrieve.py
@@ -0,0 +1,104 @@
import os
import sys
import yaml
import logging
import argparse

# SQLite compatibility
try:
    __import__('pysqlite3')
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
except ImportError:
    pass

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Import from lib
from lib.decomposition_generator import generate_strategies
from lib.decomposition_executor import execute_all_strategies
from lib.retrieval_analyzer import analyze_results

CONFIG_PATH = "config.yaml"

def load_config(path: str):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Config file not found: {path}")
    with open(path, 'r') as f:
        return yaml.safe_load(f)

def main():
    parser = argparse.ArgumentParser(description="Run Decomposition Pipeline")
    parser.add_argument(
        "--mode",
        # Removed "auto" from choices
        choices=["generate", "execute", "analyze"],
        default="generate",
        help=(
            "generate: Create strategy files only (then stop for review). "
            "execute: Run existing strategy files and analyze results. "
            "analyze: Run analysis on existing predictions only."
        )
    )
    args = parser.parse_args()

    logger.info(f"Loading configuration from {CONFIG_PATH}...")
    config = load_config(CONFIG_PATH)

    queries_path = config['data']['queries_file']
    subset_name = os.path.splitext(os.path.basename(queries_path))[0]
    target_k = config['decomposition'].get('top_k', 100)

    strategies_dir = f"results/decomposition/{subset_name}_k{target_k}"
    pred_output_path = f"results/decomposition/pred_set_ops_{subset_name}.jsonl"
    report_output_path = f"results/decomposition/results/recall_report_{subset_name}.txt"

    # MODE: GENERATE
    if args.mode == "generate":
        logger.info("=== Step 1: Generating Decompositions ===")
        generate_strategies(
            queries_path=queries_path,
            output_dir=strategies_dir,
            target_k=target_k,
            llm_model=config.get('decomposition', {}).get('llm_model', 'gpt-4o-mini')
        )

        logger.info("\n" + "="*60)
        logger.info(f"GENERATION COMPLETE. Strategies saved to: {strategies_dir}")
        logger.info("IMPORTANT: Please review/edit the generated Python files.")
        logger.info("When ready, run this script again with: --mode execute")
        logger.info("="*60 + "\n")
        return  # Stop execution after generation

    # MODE: EXECUTE
    elif args.mode == "execute":
        logger.info("=== Step 2: Executing Strategies ===")

        if not os.path.exists(strategies_dir) or not os.listdir(strategies_dir):
            logger.error(f"No strategies found in {strategies_dir}. Run with --mode generate first.")
            return

        execute_all_strategies(
            strategies_dir=strategies_dir,
            queries_path=queries_path,
            output_path=pred_output_path,
            config=config
        )
        # Fall through to analysis (Step 3)

    if args.mode in ["execute", "analyze"]:
        logger.info("=== Step 3: Analyzing Results ===")
        if os.path.exists(pred_output_path):
            analyze_results(
                gold_path=queries_path,
                pred_path=pred_output_path,
                output_report_path=report_output_path
            )
        else:
            logger.error(f"Prediction file not found at {pred_output_path}. Cannot analyze.")

    logger.info("Decomposition pipeline tasks completed.")

if __name__ == "__main__":
    main()
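
With the `config.yaml` above (`queries_file: data/train_subset1.jsonl`, `decomposition.top_k: 100`), the path construction in `main()` means `--mode generate` writes one strategy file per query to `results/decomposition/train_subset1_k100/`, `--mode execute` runs those strategies and writes predictions to `results/decomposition/pred_set_ops_train_subset1.jsonl`, and the analysis step writes its recall report to `results/decomposition/results/recall_report_train_subset1.txt`.
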
61 changes: 61 additions & 0 deletions baseline_quest/examples/example_query.jsonl
@@ -0,0 +1,61 @@
{
"query": "Non Horror demon novels.",
"docs": ["List of the Lost", "Blood Price", "The Black Spider", "The Castle in the Forest", "The Devil in Love (novel)", "Melmoth the Wanderer", "Practical Demonkeeping", "Artemis Fowl and the Lost Colony", "The Black Tattoo", "Good Omens", "Eric (novel)"], "original_query": "<mark>Demon novels</mark> that are not <mark>Horror novel series</mark>",
"scores": null,
"metadata": {
"template": "_ that are not _",
"domain": "books",
"fluency": ["Fluent: It is clear, and grammatically correct."],
"meaning": ["Same Meaning: The paraphrased query asks for the same set of items as the original query. All the highlighted clauses are included."], "naturalness": ["Yes - A user could plausibly issue this query."],
"relevance_ratings": {
"List of the Lost": ["Definitely relevant"],
"Blood Price": ["Definitely relevant"],
"The Black Spider": ["Likely relevant"],
"The Castle in the Forest": ["Likely relevant"],
"The Devil in Love (novel)": ["Definitely relevant"],
"Melmoth the Wanderer": ["Definitely relevant"],
"Practical Demonkeeping": ["Likely relevant"],
"Artemis Fowl and the Lost Colony": ["Definitely relevant"],
"The Black Tattoo": ["Definitely relevant"],
"Good Omens": ["Likely relevant"],
"Eric (novel)": ["Definitely relevant"]},
"evidence_ratings": {"List of the Lost": ["Complete"],
"Blood Price": ["Complete"],
"The Black Spider": ["Partial"],
"The Castle in the Forest": ["Partial"],
"The Devil in Love (novel)": ["Complete"],
"Melmoth the Wanderer": ["Complete"],
"Practical Demonkeeping": ["Partial"],
"Artemis Fowl and the Lost Colony": ["Complete"],
"The Black Tattoo": ["Complete"],
"Good Omens": ["Partial"],
"Eric (novel)": ["Complete"]
},
"attributions": {
"List of the Lost": [{"Non Horror demon novels.": "The book is about a 1970s relay team in Boston who accidentally kill a homeless person, whose death brings misfortune to the team."}],
"Blood Price": [{"Non Horror demon novels.": "He tells her that the killer is a demon, that she actually did see him disappear."}],
"The Black Spider": [{"demon": "The hunter used his demonic powers to instill a curse in the kiss, which would ensure his payment."}],
"The Castle in the Forest": [{"Non Horror demon novels.": "'''''The Castle in the Forest''''' is the last novel by writer Norman Mailer, published in the year of his death, 2007. It is the story of Adolf Hitler's childhood as seen through the eyes of Dieter, a demon sent to put him on his destructive path. The novel explores the idea that Hitler was the product of incest. It forms a thematic contrast with the writer's immediately previous novel ''The Gospel According to the Son'' (1999), which deals with the early life of Jesus. It received a good deal of praise, including a glowing review from Lee Siegel of ''The New York Times Book Review'', and was the ''New York Times'' Bestseller for 2007."}],
"The Devil in Love (novel)": [{"Non Horror demon novels.": "Author of ''The Devil in Love, Jacques Cazotte''\n'''''The Devil in Love''''' (, 1772) is an occult romance by Jacques Cazotte which tells of a demon, or devil, who falls in love with a young Spanish nobleman named Don Alvaro, an amateur human dabbler, and attempts, in the guise of a young woman, to win his affections."}],
"Melmoth the Wanderer": [{"Non Horror demon novels.": "'''''Melmoth the Wanderer''''' is an 1820 Gothic novel by Irish playwright, novelist and clergyman Charles Maturin. The novel's titular character is a scholar who sold his soul to the devil in exchange for 150 extra years of life, and searches the world for someone who will take over the pact for him, in a manner reminiscent of the Wandering Jew."}],
"Practical Demonkeeping": [{"Non Horror demon novels": "His first novel, it deals with a demon from Hell and his master."}],
"Artemis Fowl and the Lost Colony": [{"Non Horror demon novels.": "In Barcelona, Spain, Artemis Fowl II and Butler, his bodyguard, wait for a demon. They suddenly encounter a demon who transports Artemis through time."}],
"The Black Tattoo": [{"Non Horror demon novels.": "'''''The Black Tattoo''''' is a young adult fantasy novel by Sam Enthoven, published in 2006. It deals with a boy, Charlie, becoming possessed by a demon that manifests itself in the form of a black tattoo on his body."}],
"Good Omens": [{"demon novels.": "There are attempts by the angel Aziraphale and the demon Crowley to sabotage the coming of the end times, having grown accustomed to their comfortable surroundings in England."}],
"Eric (novel)": [{"demon novels.": "the Demon King"}]
}
}
}

{
"query": "what are Oceanian realm fauna that are also both Birds of North America and Fauna of Europe", "docs": ["Sooty tern", "Bulwer's petrel", "Black noddy", "Bar-tailed godwit", "Masked booby", "Red-footed booby", "Roseate tern"],
"original_query": "<mark>Oceanian realm fauna</mark> that are also both <mark>Birds of North America</mark> and <mark>Fauna of Europe</mark>",
"scores": null,
"metadata": {
"template": "_ that are also both _ and _",
"relevance_ratings": null,
"evidence_ratings": null,
"attributions": null,
"domain": "animals"
}
}
18 changes: 18 additions & 0 deletions baseline_quest/examples/query_10.py
@@ -0,0 +1,18 @@
from typing import Dict

# Strategy for Query 10: 1912 films set in England (No Truncation)
def execute_query(retrieve, k1, k2):
    films_1912_dict = retrieve("1912 films", k1)
    films_set_in_england_dict = retrieve("English films", k2)

    films_1912_titles = set(films_1912_dict.keys())
    films_set_in_england_titles = set(films_set_in_england_dict.keys())

    intersecting_titles = films_1912_titles & films_set_in_england_titles

    final_docs_dict = {
        title: films_1912_dict[title]
        for title in intersecting_titles
    }

    return final_docs_dict, films_1912_dict, films_set_in_england_dict
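
The executor that runs these strategy files, `lib/decomposition_executor.py`, is not included in this diff. A minimal sketch of how a strategy like the one above might be driven, assuming `retrieve` returns a `{title: score}` dict as `execute_query` here expects; the placeholder retriever and the hard-coded path are illustrative only:

```python
import importlib.util

def load_strategy(path: str):
    """Load a generated strategy module and return its execute_query function."""
    spec = importlib.util.spec_from_file_location("strategy", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.execute_query

def retrieve(query: str, k: int) -> dict:
    """Placeholder retriever: the real one would run vector-similarity search
    and return the top-k documents for `query` as a {title: score} dict."""
    return {f"doc_{i} for '{query}'": 1.0 - i / k for i in range(k)}

execute_query = load_strategy("examples/query_10.py")
final_docs, docs_1, docs_2 = execute_query(retrieve, k1=100, k2=100)
print(f"{len(final_docs)} documents in the intersection")
```
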
13 changes: 13 additions & 0 deletions baseline_quest/examples/query_9.py
@@ -0,0 +1,13 @@
# Strategy for Query 9: romance films from New Zealand (No Truncation)
def execute_query(retrieve, k1, k2):
    # Step 1: Retrieve all romance films
    romance_films = retrieve("romance films", k1)

    # Step 2: Retrieve all films from New Zealand
    new_zealand_films = retrieve("films from New Zealand", k2)

    # Step 3: Intersect the two sets to find romance films specifically from New Zealand
    romance_nz_films = romance_films & new_zealand_films

    # Returning the result as a list
    return list(romance_nz_films)