Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
113 changes: 113 additions & 0 deletions dataset/enrichment/qa_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import os
import configparser
import requests
import json

def generate_qna_dataset(prompt="You are an expert in {domain_of_expertise}. Generate questions and answers based on the text content you are provided with.", model_expertise="Software Engineering", input_dir="../acquisition/temp/text_data", base_url="http://localhost:8080/api/generate", model_name="gemma3:27b", authorization_token=None):
    """
    Generates a semi-synthetic Q&A dataset from text files in a directory using Ollama.

    Args:
        prompt (str): Prompt template; may contain a {domain_of_expertise} placeholder.
        model_expertise (str): Domain substituted into the prompt template.
        input_dir (str): The directory containing the text files. Defaults to "../acquisition/temp/text_data".
        base_url (str): The base URL for the Ollama API. Defaults to "http://localhost:8080/api/generate".
        model_name (str): The name of the Ollama model to use. Defaults to "gemma3:27b".
        authorization_token (str | None): Optional bearer token added as an Authorization header.

    Returns:
        list: A list of dictionaries, where each dictionary represents a Q&A pair.
              Returns an empty list if no files are found in the input directory or if there are errors during API calls.
    """

    qna_dataset = []
    text_files = [f for f in os.listdir(input_dir) if f.endswith(".txt")]

    if not text_files:
        print(f"No .txt files found in {input_dir}")
        return qna_dataset

    # BUG FIX: format the template once, before the loop. The original rebound
    # the `prompt` parameter on every iteration, which only worked by accident
    # (re-formatting an already-formatted string is a no-op).
    filled_prompt = prompt.format(domain_of_expertise=model_expertise)

    # JSON schema passed via Ollama's "format" field to request structured output.
    # Loop-invariant, so built once here.
    response_schema = {
        "type": "object",
        "properties": {
            "qnaList": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "q": {"type": "string"},
                        "a": {"type": "string"}
                    }
                }
            }
        },
        "required": ["qnaList"]
    }

    # Headers are identical for every request — build them once.
    headers = {"Content-Type": "application/json"}
    if authorization_token:
        headers["Authorization"] = f"Bearer {authorization_token}"

    for filename in text_files:
        filepath = os.path.join(input_dir, filename)
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                document_content = f.read()
        except Exception as e:
            # BUG FIX: report which file failed (original printed "(unknown)").
            print(f"Error reading file {filename}: {e}")
            continue

        # Typo fixed: "sythetic" -> "synthetic".
        print("Generating semi-synthetic data based on: " + filename)

        request_body = {
            "model": model_name,
            "prompt": filled_prompt + "\n" + document_content,
            "stream": False,
            "images": None,
            "options": None,
            "format": response_schema
        }

        try:
            # Use the json= kwarg (serializes the body for us) and a timeout so a
            # hung server cannot block the pipeline forever. Timeouts surface as
            # requests.exceptions.RequestException, handled below.
            response = requests.post(base_url, headers=headers, json=request_body, timeout=300)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            json_response = response.json()

            # Ollama returns the structured output as a JSON string in "response".
            qna_string = json_response.get("response", "")
            if "qnaList" in qna_string:
                qna = json.loads(qna_string)
                # .get avoids a KeyError if the model omitted the key despite
                # the substring match above.
                qna_dataset.extend(qna.get("qnaList", []))
            else:
                print(f"Unexpected response format from Ollama for file {filename}: {json_response}")

        except requests.exceptions.RequestException as e:
            print(f"Error making request to Ollama for file {filename}: {e}")
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON response from Ollama for file {filename}: {e}")

    return qna_dataset

def main(input_dir=None, config=None):
    """
    Entry point: load settings, generate the Q&A dataset, and save it to disk.

    Args:
        input_dir (str | None): Overrides the configured input directory when given.
        config (configparser.ConfigParser | None): Pre-loaded configuration;
            when None, 'config.ini' is read from the current working directory.

    Side effects:
        Writes the generated dataset to 'qna_dataset.json' in the working
        directory when generation succeeds.
    """
    # PEP 8: compare against None with `is`, not `==`.
    if config is None:
        config = configparser.ConfigParser()
        config.read('config.ini')

    # Single lookup of the DEFAULT section instead of repeating it per key.
    defaults = config['DEFAULT']
    model_expertise = defaults['model_expertise']
    if input_dir is None:
        input_dir = defaults['input_dir']
    base_url = defaults['base_url']
    model_name = defaults['model_name']
    authorization_token = defaults['authorization_token']
    dataset_prompt = defaults['dataset_prompt']

    # Generate dataset
    dataset = generate_qna_dataset(dataset_prompt, model_expertise, input_dir, base_url, model_name, authorization_token)

    if dataset:
        print(f"Generated {len(dataset)} Q&A pairs.")
        with open("qna_dataset.json", "w", encoding="utf-8") as f:
            json.dump(dataset, f, indent=4)
        print("Q&A dataset saved to qna_dataset.json")
    else:
        print("Failed to generate Q&A dataset.")

if __name__ == "__main__":
    main()
37 changes: 29 additions & 8 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,35 @@
# Core dependencies for EMTP (Expert Model Training Pipeline)

# URL retrieval dependencies
ddgs>=0.1.0

# Text processing dependencies
language-tool-python>=2.9.4

# PDF processing dependencies
pypdf>=4.0.0
requests==2.32.3

Check warning

Code scanning / Trivy

requests: Requests vulnerable to .netrc credentials leak via malicious URLs Medium

Package: requests
Installed Version: 2.32.3
Vulnerability CVE-2024-47081
Severity: MEDIUM
Fixed Version: 2.32.4
Link: CVE-2024-47081
aiohttp==3.11.18

Check failure

Code scanning / Trivy

aiohttp: AIOHTTP's HTTP Parser auto_decompress feature is vulnerable to zip bomb High

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69223
Severity: HIGH
Fixed Version: 3.13.3
Link: CVE-2025-69223

Check warning

Code scanning / Trivy

aiohttp: AIOHTTP: Denial of Service via excessive CPU usage in chunked message handling Medium

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69229
Severity: MEDIUM
Fixed Version: 3.13.3
Link: CVE-2025-69229

Check warning

Code scanning / Trivy

aiohttp: aiohttp: Denial of Service via memory exhaustion from crafted POST request Medium

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69228
Severity: MEDIUM
Fixed Version: 3.13.3
Link: CVE-2025-69228

Check warning

Code scanning / Trivy

aiohttp: aiohttp: Denial of Service via specially crafted POST request Medium

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69227
Severity: MEDIUM
Fixed Version: 3.13.3
Link: CVE-2025-69227

Check notice

Code scanning / Trivy

aiohttp: aiohttp: Denial of Service via specially crafted invalid cookies Low

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69230
Severity: LOW
Fixed Version: 3.13.3
Link: CVE-2025-69230

Check notice

Code scanning / Trivy

aiohttp: aiohttp: Information disclosure of path components via static file path normalization Low

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69226
Severity: LOW
Fixed Version: 3.13.3
Link: CVE-2025-69226

Check notice

Code scanning / Trivy

aiohttp: aiohttp: Request smuggling vulnerability via non-ASCII decimals in Range header Low

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69225
Severity: LOW
Fixed Version: 3.13.3
Link: CVE-2025-69225

Check notice

Code scanning / Trivy

aiohttp: aiohttp: Request smuggling via non-ASCII characters in HTTP parser Low

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-69224
Severity: LOW
Fixed Version: 3.13.3
Link: CVE-2025-69224

Check notice

Code scanning / Trivy

aiohttp: AIOHTTP HTTP Request/Response Smuggling Low

Package: aiohttp
Installed Version: 3.11.18
Vulnerability CVE-2025-53643
Severity: LOW
Fixed Version: 3.12.14
Link: CVE-2025-53643

--extra-index-url https://download.pytorch.org/whl/cu128

# Core ML Stack (Unchanged)
torch==2.7.0+cu128

Check notice

Code scanning / Trivy

A vulnerability, which was classified as problematic, was found in PyT ... Low

Package: torch
Installed Version: 2.7.0+cu128
Vulnerability CVE-2025-3730
Severity: MEDIUM
Fixed Version: 2.8.0
Link: CVE-2025-3730

Check notice

Code scanning / Trivy

torch: PyTorch torch.mkldnn_max_pool2d denial of service Low

Package: torch
Installed Version: 2.7.0+cu128
Vulnerability CVE-2025-2953
Severity: LOW
Fixed Version: 2.7.1-rc1
Link: CVE-2025-2953
torchvision==0.22.0+cu128
torchaudio==2.7.0+cu128
bitsandbytes==0.45.3
evaluate==0.4.3
peft==0.14.0
pipdeptree==2.30.0
sentencepiece==0.2.0

Check failure

Code scanning / Trivy

sentencepiece: Sentencepiece: Invalid memory access leading to potential arbitrary code execution via a crafted model file. High

Package: sentencepiece
Installed Version: 0.2.0
Vulnerability CVE-2026-1260
Severity: HIGH
Fixed Version: 0.2.1
Link: CVE-2026-1260
tensorboard==2.19.0
trl==0.15.2
wheel==0.45.1

Check failure

Code scanning / Trivy

wheel: wheel: Privilege Escalation or Arbitrary Code Execution via malicious wheel file unpacking High

Package: wheel
Installed Version: 0.45.1
Vulnerability CVE-2026-24049
Severity: HIGH
Fixed Version: 0.46.2
Link: CVE-2026-24049

# Requests
requests
aiohttp
# --- STRICT DOWNSTREAM PINS - OPTIMISED FOR BLACKWELL ---
# Pinning these ensures pip doesn't automatically upgrade them potentially breaking version compatibility.
transformers==4.51.3

Check warning

Code scanning / Trivy

transformers: Regular Expression Denial of Service (ReDoS) in huggingface/transformers Medium

Package: transformers
Installed Version: 4.51.3
Vulnerability CVE-2025-6921
Severity: MEDIUM
Fixed Version: 4.53.0
Link: CVE-2025-6921

Check warning

Code scanning / Trivy

transformers: Regular Expression Denial of Service (ReDoS) in huggingface/transformers Medium

Package: transformers
Installed Version: 4.51.3
Vulnerability CVE-2025-6638
Severity: MEDIUM
Fixed Version: 4.53.0
Link: CVE-2025-6638

Check warning

Code scanning / Trivy

transformers: Regular Expression Denial of Service (ReDoS) in huggingface/transformers Medium

Package: transformers
Installed Version: 4.51.3
Vulnerability CVE-2025-6051
Severity: MEDIUM
Fixed Version: 4.53.0
Link: CVE-2025-6051

Check warning

Code scanning / Trivy

transformers: Transformers ReDoS Vulnerability Medium

Package: transformers
Installed Version: 4.51.3
Vulnerability CVE-2025-5197
Severity: MEDIUM
Fixed Version: 4.53.0
Link: CVE-2025-5197

Check warning

Code scanning / Trivy

transformers: Regular Expression Denial of Service (ReDoS) in huggingface/transformers Medium

Package: transformers
Installed Version: 4.51.3
Vulnerability CVE-2025-3933
Severity: MEDIUM
Fixed Version: 4.52.1
Link: CVE-2025-3933

Check notice

Code scanning / Trivy

transformers: Improper Input Validation in huggingface/transformers Low

Package: transformers
Installed Version: 4.51.3
Vulnerability CVE-2025-3777
Severity: LOW
Fixed Version: 4.52.1
Link: CVE-2025-3777
datasets==3.3.2
huggingface-hub==0.31.1
numpy==2.1.2
pandas==2.2.3
accelerate==1.4.0
safetensors==0.5.3
pillow==11.0.0

Check failure

Code scanning / Trivy

pillow: Pillow: Out-of-bounds Write via Specially Crafted PSD Image High

Package: pillow
Installed Version: 11.0.0
Vulnerability CVE-2026-25990
Severity: HIGH
Fixed Version: 12.1.1
Link: CVE-2026-25990
setuptools==70.2.0

Check failure

Code scanning / Trivy

setuptools: Path Traversal Vulnerability in setuptools PackageIndex High

Package: setuptools
Installed Version: 70.2.0
Vulnerability CVE-2025-47273
Severity: HIGH
Fixed Version: 78.1.1
Link: CVE-2025-47273
pyarrow==20.0.0
72 changes: 70 additions & 2 deletions training/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,74 @@ This module is currently a placeholder for future development. It will consume p
- Checkpointing and model saving
- Integration with popular ML frameworks (PyTorch, TensorFlow, etc.)

## Dependencies
## Setup

Training dependencies will be added to `requirements.txt` when this module is implemented.
To convert a `.safetensors` file (commonly used for Hugging Face / PyTorch models like LLaMA) to the `gguf` format (used by [GGML-based](https://github.com/ggerganov/ggml) inference engines like `llama.cpp`), you generally need to follow these steps:

---

### 🔧 **Step-by-step Guide:**

#### 1. **Install Python Environment (if not already set up)**

Make sure you have Python 3.9+ and `pip` installed.

#### 2. **Clone llama.cpp**

```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

#### 3. **Install Dependencies**

Install `transformers`, `safetensors`, and other required tools:

```bash
pip install -U transformers datasets safetensors
```

Some conversions also require `sentencepiece`, `accelerate`, and `peft`:

```bash
pip install sentencepiece accelerate peft
```

#### 4. **Download/Prepare the Model**

Make sure you have the full model directory (e.g., LLaMA, Mistral, etc.) in HF format, including `config.json`, `tokenizer.model`, and the `.safetensors` files.

Example structure:
```
llama-2/
├── config.json
├── tokenizer.model
├── model-00001-of-00002.safetensors
├── model-00002-of-00002.safetensors
├── tokenizer_config.json
└── generation_config.json
```

#### 5. **Use `convert.py` to Convert**

In the `llama.cpp` repo, use the built-in `convert.py` script:

```bash
python3 convert.py models/llama-2/ --outfile llama-2-f16.gguf
```

You can specify options like `--outtype f16` or `--outtype q4_0` for quantized output.

```bash
python /home/ay/github/project-engram/model-converter/llama.cpp/convert_hf_to_gguf.py /home/ay/github/project-engram/merged_model-Quality_Assurance-expert_model --outfile /home/ay/github/project-engram/model-converter/engrams/merged_model-Quality_Assurance-expert_model.gguf
```

---

### ⚠️ Notes:

- You **must use a model architecture supported** by `llama.cpp` (e.g., LLaMA, Mistral, Mixtral, Phi-2, Gemma). Others like GPT-J or Falcon won't work directly.
- If you don't have the original Hugging Face format and only the `.safetensors` weights, you'll need the original config files as well.
- If you're using a newer architecture (e.g., `Mixtral`, `Gemma`), check the latest instructions and `convert-*.py` scripts in the [`llama.cpp/scripts`](https://github.com/ggerganov/llama.cpp/tree/master/scripts) folder.

---
Loading
Loading