Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
113 changes: 113 additions & 0 deletions dataset/enrichment/qa_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import os
import configparser
import requests
import json

def generate_qna_dataset(prompt="You are an expert in {domain_of_expertise}. Generate questions and answers based on the text content you are provided with.", model_expertise="Software Engineering", input_dir="../acquisition/temp/text_data", base_url="http://localhost:8080/api/generate", model_name="gemma3:27b", authorization_token=None):
    """
    Generates a semi-synthetic Q&A dataset from text files in a directory using Ollama.

    Args:
        prompt (str): Prompt template; may contain a {domain_of_expertise} placeholder.
        model_expertise (str): Domain substituted into the prompt template.
        input_dir (str): The directory containing the text files. Defaults to "../acquisition/temp/text_data".
        base_url (str): The base URL for the Ollama API. Defaults to "http://localhost:8080/api/generate".
        model_name (str): The name of the Ollama model to use. Defaults to "gemma3:27b".
        authorization_token (str | None): Optional bearer token added as an Authorization header.

    Returns:
        list: A list of dictionaries, where each dictionary represents a Q&A pair.
              Returns an empty list if no files are found in the input directory or if there are errors during API calls.
    """

    qna_dataset = []
    text_files = [f for f in os.listdir(input_dir) if f.endswith(".txt")]

    if not text_files:
        print(f"No .txt files found in {input_dir}")
        return qna_dataset

    # BUG FIX: format the template once, before the loop. The original rebound
    # the `prompt` parameter on every iteration, which only worked by accident
    # (re-formatting an already-formatted string is a no-op).
    filled_prompt = prompt.format(domain_of_expertise=model_expertise)

    # JSON schema passed via Ollama's "format" field to request structured output.
    # Loop-invariant, so built once here.
    response_schema = {
        "type": "object",
        "properties": {
            "qnaList": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "q": {"type": "string"},
                        "a": {"type": "string"}
                    }
                }
            }
        },
        "required": ["qnaList"]
    }

    # Headers are identical for every request — build them once.
    headers = {"Content-Type": "application/json"}
    if authorization_token:
        headers["Authorization"] = f"Bearer {authorization_token}"

    for filename in text_files:
        filepath = os.path.join(input_dir, filename)
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                document_content = f.read()
        except Exception as e:
            # BUG FIX: report which file failed (original printed "(unknown)").
            print(f"Error reading file {filename}: {e}")
            continue

        # Typo fixed: "sythetic" -> "synthetic".
        print("Generating semi-synthetic data based on: " + filename)

        request_body = {
            "model": model_name,
            "prompt": filled_prompt + "\n" + document_content,
            "stream": False,
            "images": None,
            "options": None,
            "format": response_schema
        }

        try:
            # Use the json= kwarg (serializes the body for us) and a timeout so a
            # hung server cannot block the pipeline forever. Timeouts surface as
            # requests.exceptions.RequestException, handled below.
            response = requests.post(base_url, headers=headers, json=request_body, timeout=300)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            json_response = response.json()

            # Ollama returns the structured output as a JSON string in "response".
            qna_string = json_response.get("response", "")
            if "qnaList" in qna_string:
                qna = json.loads(qna_string)
                # .get avoids a KeyError if the model omitted the key despite
                # the substring match above.
                qna_dataset.extend(qna.get("qnaList", []))
            else:
                print(f"Unexpected response format from Ollama for file {filename}: {json_response}")

        except requests.exceptions.RequestException as e:
            print(f"Error making request to Ollama for file {filename}: {e}")
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON response from Ollama for file {filename}: {e}")

    return qna_dataset

def main(input_dir=None, config=None):
    """
    Entry point: load settings, generate the Q&A dataset, and save it to disk.

    Args:
        input_dir (str | None): Overrides the configured input directory when given.
        config (configparser.ConfigParser | None): Pre-loaded configuration;
            when None, 'config.ini' is read from the current working directory.

    Side effects:
        Writes the generated dataset to 'qna_dataset.json' in the working
        directory when generation succeeds.
    """
    # PEP 8: compare against None with `is`, not `==`.
    if config is None:
        config = configparser.ConfigParser()
        config.read('config.ini')

    # Single lookup of the DEFAULT section instead of repeating it per key.
    defaults = config['DEFAULT']
    model_expertise = defaults['model_expertise']
    if input_dir is None:
        input_dir = defaults['input_dir']
    base_url = defaults['base_url']
    model_name = defaults['model_name']
    authorization_token = defaults['authorization_token']
    dataset_prompt = defaults['dataset_prompt']

    # Generate dataset
    dataset = generate_qna_dataset(dataset_prompt, model_expertise, input_dir, base_url, model_name, authorization_token)

    if dataset:
        print(f"Generated {len(dataset)} Q&A pairs.")
        with open("qna_dataset.json", "w", encoding="utf-8") as f:
            json.dump(dataset, f, indent=4)
        print("Q&A dataset saved to qna_dataset.json")
    else:
        print("Failed to generate Q&A dataset.")

if __name__ == "__main__":
    main()
37 changes: 29 additions & 8 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,35 @@
# Core dependencies for EMTP (Expert Model Training Pipeline)

# URL retrieval dependencies
ddgs>=0.1.0

# Text processing dependencies
language-tool-python>=2.9.4

# PDF processing dependencies
pypdf>=4.0.0
requests==2.32.3

Check warning

Code scanning / Trivy

requests: Requests vulnerable to .netrc credentials leak via malicious URLs Medium

Package: requests
Installed Version: 2.32.3
Vulnerability CVE-2024-47081
Severity: MEDIUM
Fixed Version: 2.32.4
Link: CVE-2024-47081
aiohttp==3.11.18

Check failure

Code scanning / Trivy

aiohttp: AIOHTTP's HTTP Parser auto_decompress feature is vulnerable to zip bomb High

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69223
Severity: HIGH
Fixed Version: 3.13.3
Link: CVE-2025-69223

Check warning

Code scanning / Trivy

aiohttp: AIOHTTP: Denial of Service via excessive CPU usage in chunked message handling Medium

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69229
Severity: MEDIUM
Fixed Version: 3.13.3
Link: CVE-2025-69229

Check warning

Code scanning / Trivy

aiohttp: aiohttp: Denial of Service via memory exhaustion from crafted POST request Medium

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69228
Severity: MEDIUM
Fixed Version: 3.13.3
Link: CVE-2025-69228

Check warning

Code scanning / Trivy

aiohttp: aiohttp: Denial of Service via specially crafted POST request Medium

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69227
Severity: MEDIUM
Fixed Version: 3.13.3
Link: CVE-2025-69227

Check notice

Code scanning / Trivy

aiohttp: aiohttp: Denial of Service via specially crafted invalid cookies Low

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69230
Severity: LOW
Fixed Version: 3.13.3
Link: CVE-2025-69230

Check notice

Code scanning / Trivy

aiohttp: aiohttp: Information disclosure of path components via static file path normalization Low

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69226
Severity: LOW
Fixed Version: 3.13.3
Link: CVE-2025-69226

Check notice

Code scanning / Trivy

aiohttp: aiohttp: Request smuggling vulnerability via non-ASCII decimals in Range header Low

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69225
Severity: LOW
Fixed Version: 3.13.3
Link: CVE-2025-69225

Check notice

Code scanning / Trivy

aiohttp: aiohttp: Request smuggling via non-ASCII characters in HTTP parser Low

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69224
Severity: LOW
Fixed Version: 3.13.3
Link: CVE-2025-69224

Check notice

Code scanning / Trivy

aiohttp: AIOHTTP HTTP Request/Response Smuggling Low

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-53643
Severity: LOW
Fixed Version: 3.12.14
Link: CVE-2025-53643

--extra-index-url https://download.pytorch.org/whl/cu128

# Core ML Stack (Unchanged)
torch==2.7.0+cu128

Check notice

Code scanning / Trivy

A vulnerability, which was classified as problematic, was found in PyT ... Low

Package: torch
Installed Version: 2.7.0+cu128
Vulnerability CVE-2025-3730
Severity: MEDIUM
Fixed Version: 2.8.0
Link: CVE-2025-3730

Check notice

Code scanning / Trivy

torch: PyTorch torch.mkldnn_max_pool2d denial of service Low

Package: torch
Installed Version: 2.7.0+cu128
Vulnerability CVE-2025-2953
Severity: LOW
Fixed Version: 2.7.1-rc1
Link: CVE-2025-2953
torchvision==0.22.0+cu128
torchaudio==2.7.0+cu128
bitsandbytes==0.45.3
evaluate==0.4.3
peft==0.14.0
pipdeptree==2.30.0
sentencepiece==0.2.0

Check failure

Code scanning / Trivy

sentencepiece: Sentencepiece: Invalid memory access leading to potential arbitrary code execution via a crafted model file. High

Package: sentencepiece
Installed Version: 0.2.0
Vulnerability CVE-2026-1260
Severity: HIGH
Fixed Version: 0.2.1
Link: CVE-2026-1260
tensorboard==2.19.0
trl==0.15.2
wheel==0.45.1

Check failure

Code scanning / Trivy

wheel: wheel: Privilege Escalation or Arbitrary Code Execution via malicious wheel file unpacking High

Package: wheel
Installed Version: 0.45.1
Vulnerability CVE-2026-24049
Severity: HIGH
Fixed Version: 0.46.2
Link: CVE-2026-24049

# Requests
requests
aiohttp
# --- STRICT DOWNSTREAM PINS - OPTIMISED FOR BLACKWELL ---
# Pinning these ensures pip doesn't automatically upgrade them potentially breaking version compatibility.
transformers==4.51.3

Check warning

Code scanning / Trivy

transformers: Regular Expression Denial of Service (ReDoS) in huggingface/transformers Medium

Package: transformers
Installed Version: 4.51.3
Vulnerability CVE-2025-6921
Severity: MEDIUM
Fixed Version: 4.53.0
Link: CVE-2025-6921

Check warning

Code scanning / Trivy

transformers: Regular Expression Denial of Service (ReDoS) in huggingface/transformers Medium

Package: transformers
Installed Version: 4.51.3
Vulnerability CVE-2025-6638
Severity: MEDIUM
Fixed Version: 4.53.0
Link: CVE-2025-6638

Check warning

Code scanning / Trivy

transformers: Regular Expression Denial of Service (ReDoS) in huggingface/transformers Medium

Package: transformers
Installed Version: 4.51.3
Vulnerability CVE-2025-6051
Severity: MEDIUM
Fixed Version: 4.53.0
Link: CVE-2025-6051

Check warning

Code scanning / Trivy

transformers: Transformers ReDoS Vulnerability Medium

Package: transformers
Installed Version: 4.51.3
Vulnerability CVE-2025-5197
Severity: MEDIUM
Fixed Version: 4.53.0
Link: CVE-2025-5197

Check warning

Code scanning / Trivy

transformers: Regular Expression Denial of Service (ReDoS) in huggingface/transformers Medium

Package: transformers
Installed Version: 4.51.3
Vulnerability CVE-2025-3933
Severity: MEDIUM
Fixed Version: 4.52.1
Link: CVE-2025-3933

Check notice

Code scanning / Trivy

transformers: Improper Input Validation in huggingface/transformers Low

Package: transformers
Installed Version: 4.51.3
Vulnerability CVE-2025-3777
Severity: LOW
Fixed Version: 4.52.1
Link: CVE-2025-3777
datasets==3.3.2
huggingface-hub==0.31.1
numpy==2.1.2
pandas==2.2.3
accelerate==1.4.0
safetensors==0.5.3
pillow==11.0.0

Check failure

Code scanning / Trivy

pillow: Pillow: Out-of-bounds Write via Specially Crafted PSD Image High

Package: pillow
Installed Version: 11.0.0
Vulnerability CVE-2026-25990
Severity: HIGH
Fixed Version: 12.1.1
Link: CVE-2026-25990
setuptools==70.2.0

Check failure

Code scanning / Trivy

setuptools: Path Traversal Vulnerability in setuptools PackageIndex High

Package: setuptools
Installed Version: 70.2.0
Vulnerability CVE-2025-47273
Severity: HIGH
Fixed Version: 78.1.1
Link: CVE-2025-47273
pyarrow==20.0.0
72 changes: 70 additions & 2 deletions training/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,74 @@ This module is currently a placeholder for future development. It will consume p
- Checkpointing and model saving
- Integration with popular ML frameworks (PyTorch, TensorFlow, etc.)

## Dependencies
## Setup

Training dependencies will be added to `requirements.txt` when this module is implemented.
To convert a `.safetensors` file (commonly used for Hugging Face / PyTorch models like LLaMA) to the `gguf` format (used by [GGML-based](https://github.com/ggerganov/ggml) inference engines like `llama.cpp`), you generally need to follow these steps:

---

### 🔧 **Step-by-step Guide:**

#### 1. **Install Python Environment (if not already set up)**

Make sure you have Python 3.9+ and `pip` installed.

#### 2. **Clone llama.cpp**

```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

#### 3. **Install Dependencies**

Install `transformers`, `safetensors`, and other required tools:

```bash
pip install -U transformers datasets safetensors
```

Some conversions also require `sentencepiece`, `accelerate`, and `peft`:

```bash
pip install sentencepiece accelerate peft
```

#### 4. **Download/Prepare the Model**

Make sure you have the full model directory (e.g., LLaMA, Mistral, etc.) in HF format, including `config.json`, `tokenizer.model`, and the `.safetensors` files.

Example structure:
```
llama-2/
├── config.json
├── tokenizer.model
├── model-00001-of-00002.safetensors
├── model-00002-of-00002.safetensors
├── tokenizer_config.json
└── generation_config.json
```

#### 5. **Use `convert.py` to Convert**

In the `llama.cpp` repo, use the built-in `convert.py` script:

```bash
python3 convert.py models/llama-2/ --outfile llama-2-f16.gguf
```

You can specify options like `--outtype f16` or `--outtype q4_0` for quantized output.

```bash
python /home/ay/github/project-engram/model-converter/llama.cpp/convert_hf_to_gguf.py /home/ay/github/project-engram/merged_model-Quality_Assurance-expert_model --outfile /home/ay/github/project-engram/model-converter/engrams/merged_model-Quality_Assurance-expert_model.gguf
```

---

### ⚠️ Notes:

- You **must use a model architecture supported** by `llama.cpp` (e.g., LLaMA, Mistral, Mixtral, Phi-2, Gemma). Others like GPT-J or Falcon won't work directly.
- If you don't have the original Hugging Face format and only the `.safetensors` weights, you'll need the original config files as well.
- If you're using a newer architecture (e.g., `Mixtral`, `Gemma`), check the latest instructions and `convert-*.py` scripts in the [`llama.cpp/scripts`](https://github.com/ggerganov/llama.cpp/tree/master/scripts) folder.

---
Loading
Loading