Merge pull request #8 from ashiq-km/yml_workflow

ashiq-km · web-flow · commit 5ae06bfcae0f · 2025-12-07T02:19:02.000+05:30
Yml workflow
diff --git a/app/fastapi_app.py b/app/fastapi_app.py
@@ -1,4 +1,5 @@
-# We will create an API that has one job: take a word. look up the math, and return the similar words.
+# We will create an API that has one job:
+# take a word, look up the math, and return the similar words.
 
 
 from contextlib import asynccontextmanager
@@ -22,7 +23,8 @@ async def lifespan(app: FastAPI):
     print("Loading model...")
     if not config.MODEL_FILE.exists():
 
-        # In Production, you might want to download the model from S3 / DVC here
+        # In production, you might want to
+        # download the model from S3 / DVC here
         raise FileNotFoundError("Model file not found. Run training first.")
 
     # Load the full model
@@ -51,7 +53,10 @@ async def lifespan(app: FastAPI):
 
 @app.get("/")
 def home():
-    return {"message": "Welcome to the Godfather API. Go to the /docs for testing."}
+    return {
+        "message": "Welcome to the Godfather API. \
+            Go to the /docs for testing."
+    }
 
 
 @app.get("/similar/{word}")
@@ -94,7 +99,9 @@ def get_similarity(w1: str, w2: str):
 
     if w1 not in model_wv or w2 not in model_wv:
         raise HTTPException(
-            status_code=404, detail="One of the words is missing from the vocabulary."
+            status_code=404,
+            detail="One of the words is \
+                missing from the vocabulary.",
         )
 
     score = model_wv.similarity(w1, w2)
diff --git a/app/streamlit_app.py b/app/streamlit_app.py
@@ -15,7 +15,7 @@
 st.title("🌹 The Godfather: Word Embeddings")
 st.markdown(
     """
-    Explore semantic relationships in the Godfather novel using AI.  
+    Explore semantic relationships in the Godfather novel using AI.
     🔍 Find similar words, perform analogies, and visualize relationships.
     """
 )
@@ -88,7 +88,10 @@ def download_and_load_model():
                 for w, score in similar:
                     st.progress(score, text=f"{w} ({score:.2f})")
             else:
-                st.warning(f"⚠️ The word '{word_input}' is not in the vocabulary.")
+                st.warning(
+                    f"⚠️ The word '{word_input}' \
+                           is not in the vocabulary."
+                )
 
 # --- TAB 2: ANALOGIES ---
 with tab2:
@@ -129,9 +132,9 @@ def download_and_load_model():
 st.sidebar.header("Godfather AI Controls")
 st.sidebar.markdown(
     """
-    - Use tabs to explore embeddings  
-    - Input words for similarity or analogies  
-    - Model automatically downloads if missing  
+    - Use tabs to explore embeddings
+    - Input words for similarity or analogies
+    - Model automatically downloads if missing
     - Works offline after first run
     """
 )
diff --git a/src/preprocess.py b/src/preprocess.py
@@ -24,7 +24,12 @@ def get_text_from_pdf(pdf_path):
 
         reader = pypdf.PdfReader(str(pdf_path))
 
-        for page in tqdm(reader.pages, desc=f"Reading {pdf_path.name}", leave=False):
+        for page in tqdm(
+            reader.pages,
+            desc=f"Reading \
+                         {pdf_path.name}",
+            leave=False,
+        ):
             page_text = page.extract_text()
 
             if page_text:
@@ -64,7 +69,8 @@ def main():
 
     all_sentences = []
 
-    for pdf_file in tqdm(config.RAW_DATA_FILES, desc="Processing PDFs", leave=False):
+    for pdf_file in tqdm(config.RAW_DATA_FILES,
+                          desc="Processing PDFs", leave=False):
         raw_text = get_text_from_pdf(pdf_file)
         sentences = clean_tokenize(raw_text)
         all_sentences.extend(sentences)
@@ -76,7 +82,8 @@ def main():
     print(f"Saving to {config.PROCESSED_DATA_FILE}...")
 
     with open(config.PROCESSED_DATA_FILE, "w", encoding="utf-8") as f:
-        for sentence in tqdm(all_sentences, desc="Writing sentences", leave=False):
+        for sentence in tqdm(all_sentences,
+                              desc="Writing sentences", leave=False):
             f.write(" ".join(sentence) + "\n")
 
     print("Preprocessing complete!")
diff --git a/src/train.py b/src/train.py
@@ -25,7 +25,8 @@ def train_model():
     # Check if data exists
     if not config.PROCESSED_DATA_FILE.exists():
         raise FileNotFoundError(
-            f"Processed data not found at {config.PROCESSED_DATA_FILE}. Run preprocess.py first."
+            f"Processed data not found at {config.PROCESSED_DATA_FILE}. \
+                Run preprocess.py first."
         )
 
     # Load sentences using LineSentence (memory efficient)

Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,8 @@ def train_model():`
`25`	`25`	`# Check if data exists`
`26`	`26`	`if not config.PROCESSED_DATA_FILE.exists():`
`27`	`27`	`raise FileNotFoundError(`
`28`		`- f"Processed data not found at {config.PROCESSED_DATA_FILE}. Run preprocess.py first."`
	`28`	`+ f"Processed data not found at {config.PROCESSED_DATA_FILE}. \`
	`29`	`+ Run preprocess.py first."`
`29`	`30`	`)`
`30`	`31`
`31`	`32`	`# Load sentences using LineSentence (memory efficient)`