diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..e69de29 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f1ebd6e --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +# Python artifacts +__pycache__/ +*.py[cod] +*.egg-info/ +# Virtual envs +venv/ +.env + +# Data +chroma/ +cache/ +*.apkg + +# study PDFs kept local +Dev/data/ + +# Other +.idea/ diff --git a/Dev/CONTRIBUTING.md b/Dev/CONTRIBUTING.md new file mode 100644 index 0000000..1ed7863 --- /dev/null +++ b/Dev/CONTRIBUTING.md @@ -0,0 +1,7 @@ +# Contributing + +1. Install dependencies using Poetry or `requirements.txt`. +2. Follow the existing module structure under `src/study_tools`. +3. Add tests for new functionality in `Dev/tests`. +4. Run `ruff`, `black`, and `pytest` before submitting a PR. +5. Document changes in `docs/changelog.md` and update `TODO.md` if needed. diff --git a/Dev/README.md b/Dev/README.md new file mode 100644 index 0000000..505a2ac --- /dev/null +++ b/Dev/README.md @@ -0,0 +1,22 @@ +# Study Tools Dev Package + +This `Dev/` directory houses the refactored implementation of the **Universal Study Tutor**. The old prototype remains in `messy_start/` for reference. +Course PDFs should be placed in `Dev/data/`, which is ignored by Git. + +## Features +- Configurable PDF ingestion and chunking +- Async summarisation using local Mistral and OpenAI GPT‑4o +- CLI tools for building the index, chat, flashcards and maintenance +- Learning Unit JSON schema with status counters and categories +- Externalised configuration via `config.yaml` +- Course PDFs stored locally in `Dev/data/` (see `docs/MIGRATE_LARGE_FILES.md`) + +## Quickstart +```bash +python -m pip install -r requirements.txt +python -m study_tools.build_index +python -m study_tools.summarize +python -m study_tools.cli_chat +``` + +See `docs/overview.md` for more details. 
diff --git a/Dev/agents.yaml b/Dev/agents.yaml new file mode 100644 index 0000000..cd5df75 --- /dev/null +++ b/Dev/agents.yaml @@ -0,0 +1,13 @@ +agents: + - name: Ingestor + role: Split PDFs into sentence-aware chunks and store them in Qdrant. + - name: Summariser + role: Summarise chunks using GPT-4o and cache results. + - name: Tagger + role: Classify chunks into categories with local Mistral. + - name: LUManager + role: Persist Learning Units with status counters and relations. + - name: Chat + role: Interactive Q&A and tutoring over the stored materials. + - name: FlashcardBuilder + role: Generate Anki-compatible decks from summaries. diff --git a/Dev/config.yaml b/Dev/config.yaml new file mode 100644 index 0000000..59d1e30 --- /dev/null +++ b/Dev/config.yaml @@ -0,0 +1,23 @@ +paths: + docs_dir: data + chroma_dir: chroma + cache_dir: cache +chunking: + chunk_size: 1024 + chunk_overlap: 128 + pages_per_group: 2 + page_overlap: 1 + chunk_group_limit: 6000 +models: + default: gpt-4o + tagging: mistral-7b-instruct + summarizer: gpt-4o +context_windows: + gpt-4o: 128000 + gpt-4-turbo: 128000 + gpt-4: 8192 + gpt-3.5-turbo: 16385 + mistral-7b-instruct: 32768 +limits: + tokens_per_minute: 40000 + token_margin: 512 diff --git a/Dev/docs/MIGRATE_LARGE_FILES.md b/Dev/docs/MIGRATE_LARGE_FILES.md new file mode 100644 index 0000000..6d6f148 --- /dev/null +++ b/Dev/docs/MIGRATE_LARGE_FILES.md @@ -0,0 +1,11 @@ +# Handling Large PDF Files + +Place course PDFs inside `Dev/data/` which is ignored by Git. They are not versioned by default. + +If repository limits become a problem later, you can retroactively move PDFs into Git LFS with: + +```bash +git lfs migrate import '*.pdf' +``` + +Otherwise keep the files locally and back them up to Google Drive or GCS as needed. 
diff --git a/Dev/docs/TODO.md b/Dev/docs/TODO.md new file mode 100644 index 0000000..f670be2 --- /dev/null +++ b/Dev/docs/TODO.md @@ -0,0 +1,23 @@ +# TODO Backlog + +## P0 +- Centralised configuration loader (`utils.load_config`). +- Remove hard-coded paths; read from `config.yaml`. +- Store PDFs in `Dev/data/` (optionally migrate to Git LFS later). + +## P1 +- OCR fallback and duplicate detection during ingestion. +- Implement KnowledgeNode graph with status counters. +- Tagging pipeline using local Mistral model. +- CLI commands via `python -m study_tools <command>`. + +## P2 +- Evaluation harness (ROUGE-L, entity overlap, manual rubric). +- Streamlit MVP for progress view. + +## P3 +- Difficulty-graded exam question generator (IRT). +- Anki `*.apkg` exporter with AnkiConnect. + +## P4 +- Visual progress dashboard and Obsidian vault export. diff --git a/Dev/docs/changelog.md b/Dev/docs/changelog.md new file mode 100644 index 0000000..6f2d351 --- /dev/null +++ b/Dev/docs/changelog.md @@ -0,0 +1,7 @@ +# Changelog + +## 2025-07-03 +- Initial refactor: new `Dev/` package created. +- Configuration moved to `config.yaml`. +- PDFs now stored in `Dev/data/`; Git LFS usage is optional. +- Migrated documentation and created skeleton tests. diff --git a/chunking_model_report.md b/Dev/docs/chunking_model_report.md similarity index 100% rename from chunking_model_report.md rename to Dev/docs/chunking_model_report.md diff --git a/Dev/docs/overview.md b/Dev/docs/overview.md new file mode 100644 index 0000000..0e0b660 --- /dev/null +++ b/Dev/docs/overview.md @@ -0,0 +1,12 @@ +# Overview + +The Dev package implements the second iteration of the study bot based on the **Hybrid‑Edge** architecture: + +- **Local tagging** with Mistral‑7B‑Instruct classifies text chunks into categories. +- **GPT‑4o/4.1** performs heavy summarisation and tutoring logic. +- **SQLite** stores metadata and Learning Units. **Qdrant** provides vector search. 
+- Outputs are plain JSON which are rendered to Markdown files. + +Course PDFs belong in `Dev/data/` and are not tracked in Git. + +Scripts read defaults from `config.yaml` so chunk sizes and model names are easily changed. diff --git a/Dev/pyproject.toml b/Dev/pyproject.toml new file mode 100644 index 0000000..e6ec8c5 --- /dev/null +++ b/Dev/pyproject.toml @@ -0,0 +1,23 @@ +[tool.poetry] +name = "study-tools" +version = "0.2.0" +description = "Universal Study Tutor" +authors = ["Study Bot Team"] +packages = [{include = "study_tools", from = "src"}] + +[tool.poetry.dependencies] +python = "^3.12" +llama-index-core = "*" +llama-index-llms-openai = "*" +chromadb = "*" +tiktoken = "*" +tenacity = "*" +qdrant-client = "*" +genanki = "*" +tqdm = "*" +pyyaml = "*" + +[tool.poetry.group.dev.dependencies] +pytest = "*" +ruff = "*" +black = "*" diff --git a/Dev/requirements.txt b/Dev/requirements.txt new file mode 100644 index 0000000..1c96c7e --- /dev/null +++ b/Dev/requirements.txt @@ -0,0 +1,9 @@ +llama-index-core +llama-index-llms-openai +chromadb +tiktoken +tenacity +qdrant-client +genanki +tqdm +pyyaml diff --git a/schema/learning_unit.schema.json b/Dev/schema.json similarity index 100% rename from schema/learning_unit.schema.json rename to Dev/schema.json diff --git a/Dev/src/study_tools/__init__.py b/Dev/src/study_tools/__init__.py new file mode 100644 index 0000000..e115bc3 --- /dev/null +++ b/Dev/src/study_tools/__init__.py @@ -0,0 +1,11 @@ +"""Study Tools package.""" + +__all__ = [ + "build_index", + "summarize", + "cli_chat", + "flashcards", + "ingest", + "reset", + "utils", +] diff --git a/Dev/src/study_tools/build_index.py b/Dev/src/study_tools/build_index.py new file mode 100644 index 0000000..784c433 --- /dev/null +++ b/Dev/src/study_tools/build_index.py @@ -0,0 +1,67 @@ +"""PDF ingestion and vector index creation.""" + +from pathlib import Path +import shutil + +# Heavy imports are done inside functions to allow importing this module without +# optional 
def _page_windows(n_pages: int, pages_per_group: int, overlap: int):
    """Yield half-open, 0-based ``(start, end)`` page windows covering ``n_pages``."""
    step = pages_per_group - overlap
    start = 0
    while start < n_pages:
        end = min(start + pages_per_group, n_pages)
        yield start, end
        if end == n_pages:
            # The final page is covered; a further window would only repeat
            # pages that are already part of this one.
            break
        start += step


def _iter_page_groups(pdf_path: Path, pages_per_group: int, overlap: int):
    """Generator behind :func:`extract_pages`; opens the PDF on first iteration."""
    import fitz  # PyMuPDF
    from llama_index.core import Document

    # The context manager closes the PDF once the generator is exhausted or
    # closed, instead of leaving the file handle open.
    with fitz.open(pdf_path) as doc:
        for start, end in _page_windows(len(doc), pages_per_group, overlap):
            text = "\n\n".join(doc[pg].get_text() for pg in range(start, end))
            meta = {
                "file_path": str(pdf_path),
                "file_name": pdf_path.name,
                "page_start": start + 1,  # 1-based first page of the group
                "page_end": end,  # 1-based last page (exclusive 0-based end)
            }
            yield Document(text=text, metadata=meta)


def extract_pages(pdf_path: Path, pages_per_group: int, overlap: int):
    """Split a PDF into overlapping page groups, one ``Document`` per group.

    Args:
        pdf_path: Path of the PDF to read.
        pages_per_group: Number of consecutive pages merged into one document.
        overlap: Number of pages shared between consecutive groups.

    Returns:
        An iterator of ``llama_index.core.Document`` objects whose metadata
        carries ``file_path``, ``file_name``, ``page_start`` and ``page_end``
        (both 1-based, inclusive).

    Raises:
        ValueError: If ``pages_per_group`` is not positive or ``overlap`` is
            not in ``[0, pages_per_group)``. Such settings give a window step
            of zero or less, which previously produced either no documents at
            all or a late error from ``range``.
    """
    if pages_per_group < 1:
        raise ValueError(f"pages_per_group must be >= 1, got {pages_per_group}")
    if not 0 <= overlap < pages_per_group:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < pages_per_group, "
            f"got overlap={overlap}, pages_per_group={pages_per_group}"
        )
    return _iter_page_groups(pdf_path, pages_per_group, overlap)


def main():
    """Rebuild the vector index from every PDF below the configured docs directory."""
    from llama_index.core import StorageContext, VectorStoreIndex
    from llama_index.core.node_parser import SentenceSplitter
    from llama_index.vector_stores.qdrant import QdrantVectorStore
    from qdrant_client import QdrantClient

    cfg = load_config()
    paths = cfg["paths"]
    docs_dir = Path(paths["docs_dir"])
    chroma_dir = Path(paths["chroma_dir"])
    chunk = cfg["chunking"]

    docs = []
    for pdf in sorted(docs_dir.rglob("*.pdf")):
        docs.extend(
            extract_pages(
                pdf,
                chunk["pages_per_group"],
                chunk["page_overlap"],
            )
        )
    if not docs:
        # Bail out before the destructive step so an empty or mistyped
        # docs_dir does not wipe a previously built index.
        raise SystemExit(f"No PDF pages found under {docs_dir}; existing index left untouched")

    if chroma_dir.exists():
        shutil.rmtree(chroma_dir)

    splitter = SentenceSplitter(
        chunk_size=chunk["chunk_size"],
        chunk_overlap=chunk["chunk_overlap"],
    )
    nodes = splitter.get_nodes_from_documents(docs)

    client = QdrantClient(path=str(chroma_dir))
    # Pass both arguments by keyword: the first positional parameter of
    # QdrantVectorStore is collection_name, so a positional client collides
    # with the collection_name keyword.
    store = QdrantVectorStore(collection_name="study", client=client)
    storage = StorageContext.from_defaults(vector_store=store)
    # The summarise and flashcard commands iterate index.docstore.docs, so
    # nodes must also be kept in the docstore even though the vector store
    # already holds their text.
    VectorStoreIndex(nodes, storage_context=storage, store_nodes_override=True)
    storage.persist(persist_dir=str(chroma_dir))
def main():
    """Generate ``study.apkg`` with question/answer cards for every indexed node.

    Loads the persisted index, asks the query engine to turn each stored node
    into Q&A lines, and converts every line containing a ``?`` into a basic
    Anki note (text up to the first ``?`` is the front, the rest is the back).
    """
    import genanki
    from llama_index.core import StorageContext, load_index_from_storage
    from llama_index.vector_stores.qdrant import QdrantVectorStore
    from qdrant_client import QdrantClient

    # Fixed deck id: a stable id lets a re-import update the same deck instead
    # of creating a new one on every run. The value is an arbitrary constant in
    # the 2**30..2**31 range suggested by the genanki README.
    deck_id = 1482935017

    cfg = load_config()
    chroma_path = cfg["paths"]["chroma_dir"]
    client = QdrantClient(path=chroma_path)
    # Keyword arguments only: the first positional parameter of
    # QdrantVectorStore is collection_name, not the client.
    store = QdrantVectorStore(collection_name="study", client=client)
    storage = StorageContext.from_defaults(persist_dir=chroma_path, vector_store=store)
    index = load_index_from_storage(storage)
    # Retrievers only expose retrieve(); producing an LLM answer needs a
    # query engine. similarity_top_k keeps the original retrieval breadth.
    engine = index.as_query_engine(similarity_top_k=50)

    deck = genanki.Deck(deck_id, "Study-Bot Deck")
    for node in index.docstore.docs.values():
        prompt = f"Turn this into Q&A flashcards:\n\n{node.get_content()}"
        qa = engine.query(prompt).response or ""
        for line in qa.splitlines():
            if "?" not in line:
                continue
            q, a = line.split("?", 1)
            q, a = q.strip(), a.strip()
            if not q or not a:
                # A card with an empty front or back is useless in Anki.
                continue
            note = genanki.Note(model=genanki.BASIC_MODEL, fields=[q + "?", a])
            deck.add_note(note)

    genanki.Package(deck).write_to_file("study.apkg")
    print("study.apkg ready – import into Anki")
async def _complete(llm: Any, prompt: str) -> str:
    """Run ``llm.complete(prompt)`` in a worker thread and return the stripped text.

    The call is retried up to six times with jittered exponential back-off.
    """
    from tenacity import retry, stop_after_attempt, wait_exponential_jitter

    @retry(stop=stop_after_attempt(6), wait=wait_exponential_jitter())
    async def _call() -> str:
        return await asyncio.to_thread(lambda: llm.complete(prompt).text.strip())

    return await _call()


def _sha(node: Any) -> str:
    """Return the SHA-256 hex digest of the node's text plus its sorted-key JSON metadata."""
    payload = node.text + json.dumps(node.metadata, sort_keys=True)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()


async def _complete_all(llm: Any, prompts: Sequence[str]) -> List[Any]:
    """Run all prompts concurrently; failures are returned in place, not raised."""
    return list(
        await asyncio.gather(*(_complete(llm, p) for p in prompts), return_exceptions=True)
    )


def summarise_group(llm: Any, nodes: Sequence[Any], lang: str, cache_dir: Path) -> List[str]:
    """Summarise ``nodes`` in ``lang``, reusing cached summaries where available.

    Each node is keyed by :func:`_sha`; its summary lives in
    ``cache_dir/<key>.md``. All nodes of the group that are not cached yet are
    summarised concurrently inside a single event loop, rather than starting a
    new loop per node and waiting for each call in turn.

    Args:
        llm: Object exposing ``complete(prompt)`` whose result has a ``text`` attribute.
        nodes: Nodes exposing ``text`` and ``metadata``.
        lang: Target language named in the prompt.
        cache_dir: Directory holding the per-node summary files; created if missing.

    Returns:
        One summary per input node, in input order.

    Raises:
        Exception: The first failure raised by an LLM call after retries.
            Summaries that did succeed are written to the cache first, so a
            rerun only repeats the failed nodes.
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    keys = [_sha(node) for node in nodes]

    # Insertion-ordered and de-duplicated: identical nodes are requested once.
    pending: dict[str, str] = {}
    for node, key in zip(nodes, keys):
        if key not in pending and not (cache_dir / f"{key}.md").exists():
            pending[key] = f"Summarise in {lang}:\n\n{node.text}"

    if pending:
        outcomes = asyncio.run(_complete_all(llm, list(pending.values())))
        first_error = None
        for key, outcome in zip(pending, outcomes):
            if isinstance(outcome, BaseException):
                first_error = first_error or outcome
                continue
            (cache_dir / f"{key}.md").write_text(outcome, encoding="utf-8")
        if first_error is not None:
            raise first_error

    return [(cache_dir / f"{key}.md").read_text(encoding="utf-8") for key in keys]
summaries.extend(summarise_group(llm, g, "English", cache_dir)) + + Path("summary.md").write_text("\n\n".join(summaries), encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/Dev/src/study_tools/utils.py b/Dev/src/study_tools/utils.py new file mode 100644 index 0000000..73cb830 --- /dev/null +++ b/Dev/src/study_tools/utils.py @@ -0,0 +1,14 @@ +from pathlib import Path + +DEFAULT_CONFIG_PATH = Path(__file__).resolve().parents[2] / "config.yaml" + +_cached_cfg = None + + +def load_config(path: Path | str = DEFAULT_CONFIG_PATH): + global _cached_cfg + if _cached_cfg is None or path != DEFAULT_CONFIG_PATH: + import yaml + with open(path, "r", encoding="utf-8") as fh: + _cached_cfg = yaml.safe_load(fh) + return _cached_cfg diff --git a/Dev/tests/__init__.py b/Dev/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Dev/tests/test_imports.py b/Dev/tests/test_imports.py new file mode 100644 index 0000000..dee0daf --- /dev/null +++ b/Dev/tests/test_imports.py @@ -0,0 +1,15 @@ +import importlib + +modules = [ + 'study_tools.build_index', + 'study_tools.summarize', + 'study_tools.cli_chat', + 'study_tools.flashcards', + 'study_tools.ingest', + 'study_tools.reset', + 'study_tools.utils', +] + +def test_imports(): + for m in modules: + importlib.import_module(m) diff --git a/README.md b/README.md index c239810..6eeb476 100644 --- a/README.md +++ b/README.md @@ -1,56 +1,8 @@ -# Study Tools +# Study Tools Repository -This repository contains an experimental study bot that ingests PDF course material, stores it in a vector database and generates language‑aware summaries, flashcards and chat‑style Q&A. The code base is in flux and will be refactored into a modular tutoring system. +This repo contains two versions of the study bot: -## Current State +- `messy_start/` – original prototype kept for reference. +- `Dev/` – cleaned Python package ready for further development. 
-* `build_index.py` loads PDFs from `docs/` and creates a Chroma vector store. -* `summarize_improved.py` summarises every chunk with OpenAI models, caches the results and tree‑merges them into a `summary.md` / optional PDF. -* `chat.py` provides a CLI chat interface backed by the vector store. -* `flashcards.py` turns retrieved chunks into an Anki deck. -* Several legacy scripts exist and can be ignored during refactor. - -## Setup - -1. Install Python 3.11+. -2. `pip install llama-index-core llama-index-llms-openai chromadb tiktoken genanki tenacity tqdm pypandoc` -3. Export your OpenAI API key: `export OPENAI_API_KEY=sk-...` -4. Optionally set the model via environment variable or edit `config.json`. - -## Usage - -Build the index: - -```bash -python build_index.py -``` - -Generate summaries (cached, async): - -```bash -python summarize_improved.py --no-pdf -``` - -Chat with the material: - -```bash -python chat.py -``` - -Create flashcards: - -```bash -python flashcards.py -``` - -To ingest new PDFs, place them under `docs/` (or a sub‑folder per module) and rerun `build_index.py`. - -## Architecture Overview - -1. **Ingestor** – splits PDFs into overlapping text chunks and stores them in Chroma. -2. **Summariser** – summarises each chunk and reduces by module. -3. **Merger** – combines module summaries into a final exam guide. -4. **Chat/Q&A** – retrieves relevant chunks for user questions. -5. **Flashcard Builder** – converts chunk summaries into Anki cards. - -The refactor aims to replace ad‑hoc scripts with reusable modules and a CLI entry point. JSON based "Learning Units" (see `agents.md`) will track progress and relations between pieces of knowledge. +See `Dev/README.md` for usage of the new version. 
diff --git a/Corperate_Finance_download/01 General Information 2025F-3.pdf b/messy_start/Corperate_Finance_download/01 General Information 2025F-3.pdf similarity index 100% rename from Corperate_Finance_download/01 General Information 2025F-3.pdf rename to messy_start/Corperate_Finance_download/01 General Information 2025F-3.pdf diff --git a/Corperate_Finance_download/02 Risk Return 2025F.pdf b/messy_start/Corperate_Finance_download/02 Risk Return 2025F.pdf similarity index 100% rename from Corperate_Finance_download/02 Risk Return 2025F.pdf rename to messy_start/Corperate_Finance_download/02 Risk Return 2025F.pdf diff --git a/Corperate_Finance_download/03 Portfolio Theory CAPM 2025F.pdf b/messy_start/Corperate_Finance_download/03 Portfolio Theory CAPM 2025F.pdf similarity index 100% rename from Corperate_Finance_download/03 Portfolio Theory CAPM 2025F.pdf rename to messy_start/Corperate_Finance_download/03 Portfolio Theory CAPM 2025F.pdf diff --git a/Corperate_Finance_download/04 Corporate Valuation 2025F.pdf b/messy_start/Corperate_Finance_download/04 Corporate Valuation 2025F.pdf similarity index 100% rename from Corperate_Finance_download/04 Corporate Valuation 2025F.pdf rename to messy_start/Corperate_Finance_download/04 Corporate Valuation 2025F.pdf diff --git a/Corperate_Finance_download/05 Cost of Capital 2025F.pdf b/messy_start/Corperate_Finance_download/05 Cost of Capital 2025F.pdf similarity index 100% rename from Corperate_Finance_download/05 Cost of Capital 2025F.pdf rename to messy_start/Corperate_Finance_download/05 Cost of Capital 2025F.pdf diff --git a/Corperate_Finance_download/06 Corporate Finance Issuance of Securities 2025F.pdf b/messy_start/Corperate_Finance_download/06 Corporate Finance Issuance of Securities 2025F.pdf similarity index 100% rename from Corperate_Finance_download/06 Corporate Finance Issuance of Securities 2025F.pdf rename to messy_start/Corperate_Finance_download/06 Corporate Finance Issuance of Securities 2025F.pdf diff 
--git a/Corperate_Finance_download/07 Capital Structure 2025F.pdf b/messy_start/Corperate_Finance_download/07 Capital Structure 2025F.pdf similarity index 100% rename from Corperate_Finance_download/07 Capital Structure 2025F.pdf rename to messy_start/Corperate_Finance_download/07 Capital Structure 2025F.pdf diff --git a/Corperate_Finance_download/08 Risk Management 2025F.pdf b/messy_start/Corperate_Finance_download/08 Risk Management 2025F.pdf similarity index 100% rename from Corperate_Finance_download/08 Risk Management 2025F.pdf rename to messy_start/Corperate_Finance_download/08 Risk Management 2025F.pdf diff --git a/Corperate_Finance_download/09 Corporate Governance 2025F.pdf b/messy_start/Corperate_Finance_download/09 Corporate Governance 2025F.pdf similarity index 100% rename from Corperate_Finance_download/09 Corporate Governance 2025F.pdf rename to messy_start/Corperate_Finance_download/09 Corporate Governance 2025F.pdf diff --git a/Corperate_Finance_download/10 Efficient Markets and Behavioral Finance 2025F.pdf b/messy_start/Corperate_Finance_download/10 Efficient Markets and Behavioral Finance 2025F.pdf similarity index 100% rename from Corperate_Finance_download/10 Efficient Markets and Behavioral Finance 2025F.pdf rename to messy_start/Corperate_Finance_download/10 Efficient Markets and Behavioral Finance 2025F.pdf diff --git a/Corperate_Finance_download/11 Empirical Research 2025F.pdf b/messy_start/Corperate_Finance_download/11 Empirical Research 2025F.pdf similarity index 100% rename from Corperate_Finance_download/11 Empirical Research 2025F.pdf rename to messy_start/Corperate_Finance_download/11 Empirical Research 2025F.pdf diff --git a/Corperate_Finance_download/Aviva Investors Student 2022.pdf b/messy_start/Corperate_Finance_download/Aviva Investors Student 2022.pdf similarity index 100% rename from Corperate_Finance_download/Aviva Investors Student 2022.pdf rename to messy_start/Corperate_Finance_download/Aviva Investors Student 2022.pdf diff 
--git a/Corperate_Finance_download/Cohn, J. B., and Wardlaw, M. I., 2016, Financing constraints and workplace safety, The Journal of Finance, 71(5), 2017-2058.pdf b/messy_start/Corperate_Finance_download/Cohn, J. B., and Wardlaw, M. I., 2016, Financing constraints and workplace safety, The Journal of Finance, 71(5), 2017-2058.pdf similarity index 100% rename from Corperate_Finance_download/Cohn, J. B., and Wardlaw, M. I., 2016, Financing constraints and workplace safety, The Journal of Finance, 71(5), 2017-2058.pdf rename to messy_start/Corperate_Finance_download/Cohn, J. B., and Wardlaw, M. I., 2016, Financing constraints and workplace safety, The Journal of Finance, 71(5), 2017-2058.pdf diff --git a/Corperate_Finance_download/Cox Communications Inc Student 2019.pdf b/messy_start/Corperate_Finance_download/Cox Communications Inc Student 2019.pdf similarity index 100% rename from Corperate_Finance_download/Cox Communications Inc Student 2019.pdf rename to messy_start/Corperate_Finance_download/Cox Communications Inc Student 2019.pdf diff --git a/Corperate_Finance_download/ESG Financing Constraints and Workplace Safety Solution 2019.pdf b/messy_start/Corperate_Finance_download/ESG Financing Constraints and Workplace Safety Solution 2019.pdf similarity index 100% rename from Corperate_Finance_download/ESG Financing Constraints and Workplace Safety Solution 2019.pdf rename to messy_start/Corperate_Finance_download/ESG Financing Constraints and Workplace Safety Solution 2019.pdf diff --git a/Corperate_Finance_download/ESG Financing Constraints and Workplace Safety Student 2019.pdf b/messy_start/Corperate_Finance_download/ESG Financing Constraints and Workplace Safety Student 2019.pdf similarity index 100% rename from Corperate_Finance_download/ESG Financing Constraints and Workplace Safety Student 2019.pdf rename to messy_start/Corperate_Finance_download/ESG Financing Constraints and Workplace Safety Student 2019.pdf diff --git a/Corperate_Finance_download/First 
session Solution-4.pdf b/messy_start/Corperate_Finance_download/First session Solution-4.pdf similarity index 100% rename from Corperate_Finance_download/First session Solution-4.pdf rename to messy_start/Corperate_Finance_download/First session Solution-4.pdf diff --git a/Corperate_Finance_download/First session-4.pdf b/messy_start/Corperate_Finance_download/First session-4.pdf similarity index 100% rename from Corperate_Finance_download/First session-4.pdf rename to messy_start/Corperate_Finance_download/First session-4.pdf diff --git a/Corperate_Finance_download/Formulary.pdf b/messy_start/Corperate_Finance_download/Formulary.pdf similarity index 100% rename from Corperate_Finance_download/Formulary.pdf rename to messy_start/Corperate_Finance_download/Formulary.pdf diff --git a/Corperate_Finance_download/Fourth session Solution-6.pdf b/messy_start/Corperate_Finance_download/Fourth session Solution-6.pdf similarity index 100% rename from Corperate_Finance_download/Fourth session Solution-6.pdf rename to messy_start/Corperate_Finance_download/Fourth session Solution-6.pdf diff --git a/Corperate_Finance_download/Fourth session-5.pdf b/messy_start/Corperate_Finance_download/Fourth session-5.pdf similarity index 100% rename from Corperate_Finance_download/Fourth session-5.pdf rename to messy_start/Corperate_Finance_download/Fourth session-5.pdf diff --git a/Corperate_Finance_download/Mock Exam - Solution-1.pdf b/messy_start/Corperate_Finance_download/Mock Exam - Solution-1.pdf similarity index 100% rename from Corperate_Finance_download/Mock Exam - Solution-1.pdf rename to messy_start/Corperate_Finance_download/Mock Exam - Solution-1.pdf diff --git a/Corperate_Finance_download/Mock Exam-1.pdf b/messy_start/Corperate_Finance_download/Mock Exam-1.pdf similarity index 100% rename from Corperate_Finance_download/Mock Exam-1.pdf rename to messy_start/Corperate_Finance_download/Mock Exam-1.pdf diff --git a/Corperate_Finance_download/NormDist.pdf 
b/messy_start/Corperate_Finance_download/NormDist.pdf similarity index 100% rename from Corperate_Finance_download/NormDist.pdf rename to messy_start/Corperate_Finance_download/NormDist.pdf diff --git a/Corperate_Finance_download/Second session Solution-4.pdf b/messy_start/Corperate_Finance_download/Second session Solution-4.pdf similarity index 100% rename from Corperate_Finance_download/Second session Solution-4.pdf rename to messy_start/Corperate_Finance_download/Second session Solution-4.pdf diff --git a/Corperate_Finance_download/Second session-3.pdf b/messy_start/Corperate_Finance_download/Second session-3.pdf similarity index 100% rename from Corperate_Finance_download/Second session-3.pdf rename to messy_start/Corperate_Finance_download/Second session-3.pdf diff --git a/Corperate_Finance_download/Third session Solution-5.pdf b/messy_start/Corperate_Finance_download/Third session Solution-5.pdf similarity index 100% rename from Corperate_Finance_download/Third session Solution-5.pdf rename to messy_start/Corperate_Finance_download/Third session Solution-5.pdf diff --git a/Corperate_Finance_download/Third session-5.pdf b/messy_start/Corperate_Finance_download/Third session-5.pdf similarity index 100% rename from Corperate_Finance_download/Third session-5.pdf rename to messy_start/Corperate_Finance_download/Third session-5.pdf diff --git a/Corperate_Finance_download/UBS Presentation Uni SG May 2025.pdf b/messy_start/Corperate_Finance_download/UBS Presentation Uni SG May 2025.pdf similarity index 100% rename from Corperate_Finance_download/UBS Presentation Uni SG May 2025.pdf rename to messy_start/Corperate_Finance_download/UBS Presentation Uni SG May 2025.pdf diff --git a/Corperate_Finance_download/Valuation - Consolidated Edison and Chemex AG Solution 2019.pdf b/messy_start/Corperate_Finance_download/Valuation - Consolidated Edison and Chemex AG Solution 2019.pdf similarity index 100% rename from Corperate_Finance_download/Valuation - Consolidated Edison and 
Chemex AG Solution 2019.pdf rename to messy_start/Corperate_Finance_download/Valuation - Consolidated Edison and Chemex AG Solution 2019.pdf diff --git a/Corperate_Finance_download/Valuation - Consolidated Edison and Chemex AG Student 2019.pdf b/messy_start/Corperate_Finance_download/Valuation - Consolidated Edison and Chemex AG Student 2019.pdf similarity index 100% rename from Corperate_Finance_download/Valuation - Consolidated Edison and Chemex AG Student 2019.pdf rename to messy_start/Corperate_Finance_download/Valuation - Consolidated Edison and Chemex AG Student 2019.pdf diff --git a/Corperate_Finance_download/Valuing Snap after the IPO quiet period Student 2019.pdf b/messy_start/Corperate_Finance_download/Valuing Snap after the IPO quiet period Student 2019.pdf similarity index 100% rename from Corperate_Finance_download/Valuing Snap after the IPO quiet period Student 2019.pdf rename to messy_start/Corperate_Finance_download/Valuing Snap after the IPO quiet period Student 2019.pdf diff --git a/messy_start/README.md b/messy_start/README.md new file mode 100644 index 0000000..c239810 --- /dev/null +++ b/messy_start/README.md @@ -0,0 +1,56 @@ +# Study Tools + +This repository contains an experimental study bot that ingests PDF course material, stores it in a vector database and generates language‑aware summaries, flashcards and chat‑style Q&A. The code base is in flux and will be refactored into a modular tutoring system. + +## Current State + +* `build_index.py` loads PDFs from `docs/` and creates a Chroma vector store. +* `summarize_improved.py` summarises every chunk with OpenAI models, caches the results and tree‑merges them into a `summary.md` / optional PDF. +* `chat.py` provides a CLI chat interface backed by the vector store. +* `flashcards.py` turns retrieved chunks into an Anki deck. +* Several legacy scripts exist and can be ignored during refactor. + +## Setup + +1. Install Python 3.11+. +2. 
`pip install llama-index-core llama-index-llms-openai chromadb tiktoken genanki tenacity tqdm pypandoc` +3. Export your OpenAI API key: `export OPENAI_API_KEY=sk-...` +4. Optionally set the model via environment variable or edit `config.json`. + +## Usage + +Build the index: + +```bash +python build_index.py +``` + +Generate summaries (cached, async): + +```bash +python summarize_improved.py --no-pdf +``` + +Chat with the material: + +```bash +python chat.py +``` + +Create flashcards: + +```bash +python flashcards.py +``` + +To ingest new PDFs, place them under `docs/` (or a sub‑folder per module) and rerun `build_index.py`. + +## Architecture Overview + +1. **Ingestor** – splits PDFs into overlapping text chunks and stores them in Chroma. +2. **Summariser** – summarises each chunk and reduces by module. +3. **Merger** – combines module summaries into a final exam guide. +4. **Chat/Q&A** – retrieves relevant chunks for user questions. +5. **Flashcard Builder** – converts chunk summaries into Anki cards. + +The refactor aims to replace ad‑hoc scripts with reusable modules and a CLI entry point. JSON based "Learning Units" (see `agents.md`) will track progress and relations between pieces of knowledge. 
diff --git a/agents.md b/messy_start/docs/agents.md similarity index 100% rename from agents.md rename to messy_start/docs/agents.md diff --git a/file_audit.md b/messy_start/docs/file_audit.md similarity index 100% rename from file_audit.md rename to messy_start/docs/file_audit.md diff --git a/summary.md b/messy_start/docs/summary.md similarity index 100% rename from summary.md rename to messy_start/docs/summary.md diff --git a/todo.md b/messy_start/docs/todo.md similarity index 100% rename from todo.md rename to messy_start/docs/todo.md diff --git a/schema/example_learning_unit.json b/messy_start/schema/example_learning_unit.json similarity index 100% rename from schema/example_learning_unit.json rename to messy_start/schema/example_learning_unit.json diff --git a/messy_start/schema/learning_unit.schema.json b/messy_start/schema/learning_unit.schema.json new file mode 100644 index 0000000..d5a718e --- /dev/null +++ b/messy_start/schema/learning_unit.schema.json @@ -0,0 +1,20 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "LearningUnit", + "type": "object", + "required": ["id", "text", "categories", "source", "status", "lang"], + "properties": { + "id": {"type": "string", "description": "UUID or short identifier"}, + "text": {"type": "string", "description": "core content"}, + "categories": {"type": "array", "items": {"type": "string"}}, + "source": {"type": "string", "description": "PDF name and page/chunk"}, + "status": {"type": "integer", "description": "0 unknown, >0 seen, -1 known"}, + "related_to": { + "type": "array", + "items": {"type": "string"}, + "description": "other LU ids" + }, + "anki_card_id": {"type": ["string", "null"], "description": "linked Anki card"}, + "lang": {"type": "string"} + } +}