Merge technologies into skills: keep only skills (no redundancy)

HardMax71 · HardMax71 · commit bfd38ada58f6 · 2025-09-29T00:27:40.000+02:00
diff --git a/backend/.env.example b/backend/.env.example
@@ -39,7 +39,7 @@ REDIS_JOB_PREFIX=cv:job:
 REDIS_JOB_QUEUE=cv_processing_queue
 REDIS_CLEANUP_QUEUE=cv_cleanup_queue
 REDIS_JOB_TIMEOUT=1800
-REDIS_WORKER_TIMEOUT=30
+REDIS_WORKER_TIMEOUT=3
 REDIS_MAX_RETRIES=3
 
 # -----------------------------------------------------------------------------
diff --git a/backend/README.md b/backend/README.md
@@ -21,7 +21,7 @@ No need to reinvent auth, migrations, or admin panels when Django gives you all
 
 **Processor app** - This is where the magic starts. Upload a resume, it queues a job in Redis, extracts text with pdfplumber or OCR, sends it to an LLM (OpenAI's GPT-4o-mini) to structure it properly, then generates a review of what could be improved. The structured data is what makes everything else possible.
 
-**Storage app** - Once we have structured data, it goes two places. Neo4j stores the actual resume data as a graph (people connected to companies, skills, technologies), while Qdrant stores embedding vectors for semantic search. The graph gives us relationship queries ("who worked at Google?"), vectors give us semantic similarity ("find someone like this person").
+**Storage app** - Once we have structured data, it goes to two places. Neo4j stores the actual resume data as a graph (people connected to companies and skills), while Qdrant stores embedding vectors for semantic search. The graph gives us relationship queries ("who worked at Google?"); vectors give us semantic similarity ("find someone like this person").
 
 **Search app** - This is where it all comes together. You can do semantic search (vector similarity), structured search (graph queries), or hybrid search (both combined). The results show not just who matched, but exactly which parts of their resume matched and why.
 
diff --git a/backend/core/domain/__init__.py b/backend/core/domain/__init__.py
@@ -29,7 +29,6 @@
     Resume,
     ScientificContribution,
     Skill,
-    Technology,
     WorkAuthorization,
     WorkMode,
 )
@@ -60,7 +59,6 @@
     "EmploymentDuration",
     "CompanyInfo",
     "KeyPoint",
-    "Technology",
     "Skill",
     "Project",
     "InstitutionInfo",
diff --git a/backend/core/domain/resume.py b/backend/core/domain/resume.py
@@ -12,7 +12,6 @@ class EmbeddingVector(BaseModel):
     email: str | None = None
     # Searchable metadata fields
     skills: list[str] = Field(default_factory=list)
-    technologies: list[str] = Field(default_factory=list)
     companies: list[str] = Field(default_factory=list)
     role: str | None = None
     location: str | None = None
@@ -129,10 +128,6 @@ class KeyPoint(BaseModel):
     text: str
 
 
-class Technology(BaseModel):
-    name: str
-
-
 class Skill(BaseModel):
     name: str
 
@@ -145,15 +140,17 @@ class EmploymentHistoryItem(BaseModel):
     duration: EmploymentDuration
     location: Location | None = None
     key_points: list[KeyPoint] = Field(default_factory=list)
-    technologies: list[Technology] = Field(default_factory=list)
+    skills: list[Skill] = Field(default_factory=list)
 
     @model_validator(mode="before")
     @classmethod
     def accept_legacy_employment(cls, v: dict):
         if "company" in v and isinstance(v["company"], str):
             v["company"] = {"name": v["company"]}
-        if "tech_stack" in v and "technologies" not in v:
-            v["technologies"] = v.pop("tech_stack")
+        if "tech_stack" in v and "skills" not in v:
+            v["skills"] = v.pop("tech_stack")
+        if "technologies" in v and "skills" not in v:
+            v["skills"] = v.pop("technologies")
         if "start_date" in v or "end_date" in v or "date_format" in v or "duration_months" in v:
             v["duration"] = {
                 "date_format": v.pop("date_format", "MM.YYYY"),
@@ -163,24 +160,26 @@ def accept_legacy_employment(cls, v: dict):
             }
         if "key_points" in v:
             v["key_points"] = [kp if isinstance(kp, dict) else {"text": kp} for kp in v["key_points"]]
-        if "technologies" in v:
-            v["technologies"] = [t if isinstance(t, dict) else {"name": t} for t in v["technologies"]]
+        if "skills" in v:
+            v["skills"] = [s if isinstance(s, dict) else {"name": s} for s in v["skills"]]
         return v
 
 
 class Project(BaseModel):
     title: str
     url: str | None = None
-    technologies: list[Technology] = Field(default_factory=list)
+    skills: list[Skill] = Field(default_factory=list)
     key_points: list[KeyPoint] = Field(default_factory=list)
 
     @model_validator(mode="before")
     @classmethod
     def accept_legacy_project(cls, v: dict):
-        if "tech_stack" in v and "technologies" not in v:
-            v["technologies"] = v.pop("tech_stack")
-        if "technologies" in v:
-            v["technologies"] = [t if isinstance(t, dict) else {"name": t} for t in v["technologies"]]
+        if "tech_stack" in v and "skills" not in v:
+            v["skills"] = v.pop("tech_stack")
+        if "technologies" in v and "skills" not in v:
+            v["skills"] = v.pop("technologies")
+        if "skills" in v:
+            v["skills"] = [s if isinstance(s, dict) else {"name": s} for s in v["skills"]]
         if "key_points" in v:
             v["key_points"] = [kp if isinstance(kp, dict) else {"text": kp} for kp in v["key_points"]]
         return v
@@ -322,10 +321,14 @@ def years_of_experience(self) -> float:
     def has_skill(self, skill: str) -> bool:
         return any(skill.lower() == s.name.lower() for s in self.skills)
 
-    def get_technologies(self) -> set[str]:
-        techs: set[str] = set()
+    def get_all_skills(self) -> set[str]:
+        all_skills: set[str] = set()
+        # Skills from main skills list
+        all_skills.update(s.name for s in self.skills)
+        # Skills from employment history
         for emp in self.employment_history:
-            techs.update(t.name for t in emp.technologies)
+            all_skills.update(s.name for s in emp.skills)
+        # Skills from projects
         for proj in self.projects:
-            techs.update(t.name for t in proj.technologies)
-        return techs
+            all_skills.update(s.name for s in proj.skills)
+        return all_skills
diff --git a/backend/core/domain/search.py b/backend/core/domain/search.py
@@ -11,7 +11,6 @@ class SearchType(StrEnum):
 @dataclass
 class SearchFilters:
     skills: list[str] | None = None
-    technologies: list[str] | None = None
     role: str | None = None
     company: str | None = None
     location: str | None = None
@@ -27,7 +26,6 @@ class FilterOption:
 @dataclass
 class FilterOptionsResult:
     skills: list[FilterOption] = field(default_factory=list)
-    technologies: list[FilterOption] = field(default_factory=list)
     roles: list[FilterOption] = field(default_factory=list)
     companies: list[FilterOption] = field(default_factory=list)
     locations: list[FilterOption] = field(default_factory=list)
diff --git a/backend/core/model_registry.py b/backend/core/model_registry.py
@@ -27,7 +27,6 @@
     Resume,
     ScientificContribution,
     Skill,
-    Technology,
     WorkAuthorization,
 )
 from storage.neo4j_models import (
@@ -55,7 +54,6 @@
     ResumeNode,
     ScientificContributionNode,
     SkillNode,
-    TechnologyNode,
     WorkAuthorizationNode,
 )
 
@@ -84,7 +82,6 @@ def initialize(cls) -> None:
             (EmploymentDuration, EmploymentDurationNode),
             (CompanyInfo, CompanyInfoNode),
             (KeyPoint, KeyPointInfoNode),
-            (Technology, TechnologyNode),
             (Skill, SkillNode),
             (Project, ProjectNode),
             (InstitutionInfo, InstitutionInfoNode),
diff --git a/backend/core/tests.py b/backend/core/tests.py
@@ -23,7 +23,6 @@
     SearchRequest,
     SearchType,
     Skill,
-    Technology,
     VectorHit,
     WorkMode,
 )
@@ -64,9 +63,9 @@ def setUp(self) -> None:
             KeyPoint(text="Led development of resume matching pipeline"),
             KeyPoint(text="Shipped embeddings search service at scale"),
         ]
-        technologies = [
-            Technology(name="Django"),
-            Technology(name="PostgreSQL"),
+        skills_used = [
+            Skill(name="Django"),
+            Skill(name="PostgreSQL"),
         ]
         history_payload = {
             "position": "Senior Backend Engineer",
@@ -76,7 +75,7 @@ def setUp(self) -> None:
             "duration": duration.model_dump(mode="json"),
             "location": location.model_dump(mode="json"),
             "key_points": [kp.model_dump(mode="json") for kp in key_points],
-            "technologies": [tech.model_dump(mode="json") for tech in technologies],
+            "skills": [skill.model_dump(mode="json") for skill in skills_used],
         }
         history_item = EmploymentHistoryItem.model_validate(history_payload)
 
@@ -99,7 +98,7 @@ def test_resume_serialization_includes_nested_fields(self) -> None:
         history_entry = payload["employment_history"][0]
         self.assertEqual(history_entry["company"]["name"], "AI Labs")
         self.assertEqual(history_entry["duration"]["duration_months"], 42)
-        self.assertEqual(history_entry["technologies"][0]["name"], "Django")
+        self.assertEqual(history_entry["skills"][0]["name"], "Django")
 
         profile = payload["professional_profile"]
         self.assertEqual(profile["preferences"]["role"], "Backend Engineer")
@@ -109,7 +108,7 @@ def test_resume_helper_methods(self) -> None:
         self.assertAlmostEqual(self.resume.years_of_experience(), 3.5)
         self.assertTrue(self.resume.has_skill("python"))
         self.assertFalse(self.resume.has_skill("Go"))
-        self.assertEqual(self.resume.get_technologies(), {"Django", "PostgreSQL"})
+        self.assertEqual(self.resume.get_all_skills(), {"Python", "Django", "PostgreSQL"})
 
 
 class SearchModelBehaviourTests(TestCase):
diff --git a/backend/processor/services/content_structure_service.py b/backend/processor/services/content_structure_service.py
@@ -54,9 +54,9 @@ def _prepare_prompt(self, text: str, links: list[dict]) -> str:
 
 Validation Guardrails:
 - Company names must match exactly (case-sensitive)
-- Skills only from explicit skills sections
+- Skills include ALL technical and soft capabilities (Python, React, Docker, Leadership, etc.)
 - Links must exist in original document
-- Tech stack only from explicit "Stack:" or equivalent section
+- Extract skills from ANY mention: skills sections, tech stacks, tool lists, technologies used
 
 SECTION-SPECIFIC RULES:
 
@@ -71,7 +71,7 @@ def _prepare_prompt(self, text: str, links: list[dict]) -> str:
 - DO NOT include education (degrees, university attendance) as employment history.
 - Education (Bachelor's, Master's, PhD, etc.) must ONLY go in the education section, never in employment_history.
 - Responsibilities: Use exact bullet points verbatim.
-- Tech stack: Extract only from explicit "Stack:" or equivalent section.
+- Skills: Extract ALL technologies, tools, frameworks, languages mentioned (Stack:, Technologies:, Tools:, etc.)
 - If a starting month is not explicitly mentioned (e.g., "2022 - Present"), ASSUME "01.2022 - Present".
 
 EDUCATION:
@@ -85,6 +85,7 @@ def _prepare_prompt(self, text: str, links: list[dict]) -> str:
 PROJECTS:
 - Only include personal projects that are explicitly stated as pet projects or were completed outside of employment.
 - DO NOT duplicate any project details already present in the employment_history section.
+- Skills: Extract ALL technologies, tools, frameworks mentioned in each project
 - If no qualifying projects are mentioned, set "projects" to null.
 
 LANGUAGE PROFICIENCY:
diff --git a/backend/processor/services/processing_service.py b/backend/processor/services/processing_service.py
@@ -203,15 +203,15 @@ def _generate_embeddings_from_resume(self, resume: Resume) -> list[EmbeddingVect
         embeddings = self.embedding_service.encode_batch(texts)
 
         # Extract metadata for search filtering (all lists are guaranteed to exist)
-        all_techs = [tech.name for emp in resume.employment_history for tech in emp.technologies]
-        all_techs.extend(tech.name for proj in resume.projects for tech in proj.technologies)
+        all_skills = [s.name for s in resume.skills]
+        all_skills.extend(s.name for emp in resume.employment_history for s in emp.skills)
+        all_skills.extend(s.name for proj in resume.projects for s in proj.skills)
 
         # Only include fields that EmbeddingVector expects
         vector_metadata = {
             "name": resume.personal_info.name,
             "email": resume.personal_info.contact.email,
-            "skills": [s.name for s in resume.skills],
-            "technologies": list(set(all_techs)),  # dedupe
+            "skills": list(set(all_skills)),  # dedupe all skills from everywhere
             "companies": list({emp.company.name for emp in resume.employment_history if emp.company}),
             "role": resume.professional_profile.preferences.role
             if resume.professional_profile and resume.professional_profile.preferences
diff --git a/backend/search/serializers.py b/backend/search/serializers.py
@@ -62,10 +62,10 @@ class SearchFiltersSerializer(serializers.Serializer):
     """Serializer for search filter parameters."""
 
     skills = serializers.ListField(
-        child=serializers.CharField(), required=False, allow_null=True, help_text="Skills to filter by"
-    )
-    technologies = serializers.ListField(
-        child=serializers.CharField(), required=False, allow_null=True, help_text="Technologies to filter by"
+        child=serializers.CharField(),
+        required=False,
+        allow_null=True,
+        help_text="Skills to filter by (includes technologies)",
     )
     role = serializers.CharField(required=False, allow_null=True, help_text="Desired role to filter by")
     company = serializers.CharField(required=False, allow_null=True, help_text="Company to filter by")
@@ -173,8 +173,7 @@ class FilterOptionSerializer(serializers.Serializer):
 
 
 class FilterOptionsSerializer(serializers.Serializer):
-    skills = FilterOptionSerializer(many=True, default=list, help_text="Available skills")
-    technologies = FilterOptionSerializer(many=True, default=list, help_text="Available technologies")
+    skills = FilterOptionSerializer(many=True, default=list, help_text="Available skills (includes technologies)")
     roles = FilterOptionSerializer(many=True, default=list, help_text="Available roles")
     companies = FilterOptionSerializer(many=True, default=list, help_text="Available companies")
     locations = FilterOptionSerializer(many=True, default=list, help_text="Available locations")
diff --git a/backend/search/services/graph_search.py b/backend/search/services/graph_search.py
@@ -42,13 +42,9 @@ def search(
             ($skills IS NULL OR $skills = [] OR EXISTS {
                 MATCH (resume)-[:HAS_SKILL]->(s:SkillNode)
                 WHERE s.name IN $skills
-            })
-            AND ($technologies IS NULL OR $technologies = [] OR EXISTS {
-                MATCH (resume)-[:HAS_EMPLOYMENT_HISTORY|HAS_PROJECT]->(entity)
-                WHERE EXISTS {
-                    MATCH (entity)-[:USES_TECHNOLOGY]->(t:TechnologyNode)
-                    WHERE t.name IN $technologies
-                }
+            } OR EXISTS {
+                MATCH (resume)-[:HAS_EMPLOYMENT_HISTORY|HAS_PROJECT]->(entity)-[:HAS_SKILL]->(s:SkillNode)
+                WHERE s.name IN $skills
             })
             AND ($role IS NULL OR EXISTS {
                 MATCH (resume)-[:HAS_PROFESSIONAL_PROFILE]->(pp:ProfessionalProfileNode)
@@ -176,22 +172,22 @@ def get_resumes_by_ids(self, resume_ids: list[str]) -> list[ResumeSearchResult]:
     def get_filter_options(self) -> FilterOptionsResult:
         query = """
         CALL {
-            MATCH (s:SkillNode)<-[:HAS_SKILL]-(resume:ResumeNode)
-            WITH s.name AS value, count(DISTINCT resume) AS count
+            MATCH path = (s:SkillNode)<-[:HAS_SKILL]-(entity)
+            WHERE (entity:ResumeNode) OR
+                  ((entity:EmploymentHistoryItemNode) AND (entity)<-[:HAS_EMPLOYMENT_HISTORY]-(:ResumeNode)) OR
+                  ((entity:ProjectNode) AND (entity)<-[:HAS_PROJECT]-(:ResumeNode))
+            WITH s.name AS value,
+                 CASE
+                   WHEN entity:ResumeNode THEN entity
+                   WHEN entity:EmploymentHistoryItemNode THEN head([(entity)<-[:HAS_EMPLOYMENT_HISTORY]-(r:ResumeNode) | r])
+                   WHEN entity:ProjectNode THEN head([(entity)<-[:HAS_PROJECT]-(r:ResumeNode) | r])
+                 END AS resume
+            WITH value, count(DISTINCT resume) AS count
             WHERE count > 0
-            WITH value, count
             ORDER BY count DESC, value ASC
-            LIMIT 100
+            LIMIT 200
             RETURN 'skills' AS category, collect({value: value, count: count}) AS items
             UNION
-            MATCH (t:TechnologyNode)<-[:USES_TECHNOLOGY]-()<-[:HAS_EMPLOYMENT_HISTORY|HAS_PROJECT]-(resume:ResumeNode)
-            WITH t.name AS value, count(DISTINCT resume) AS count
-            WHERE count > 0
-            WITH value, count
-            ORDER BY count DESC, value ASC
-            LIMIT 100
-            RETURN 'technologies' AS category, collect({value: value, count: count}) AS items
-            UNION
             MATCH (resume:ResumeNode)-[:HAS_PROFESSIONAL_PROFILE]->(:ProfessionalProfileNode)-[:HAS_PREFERENCES]->(pref:PreferencesNode)
             WHERE pref.role IS NOT NULL AND pref.role <> ""
             WITH pref.role AS value, count(DISTINCT resume) AS count
diff --git a/backend/storage/neo4j_models.py b/backend/storage/neo4j_models.py
@@ -130,15 +130,11 @@ class KeyPointInfoNode(StructuredNode):
     text = StringProperty(required=True)
 
 
-class TechnologyNode(StructuredNode):
-    name = StringProperty(required=True, unique_index=True)
-
-
 class EmploymentHistoryItemNode(StructuredNode):
     position = StringProperty(required=True)
     employment_type = StringProperty(required=True, choices=EMPLOYMENT_TYPE_CHOICES)
     work_mode = StringProperty(required=True, choices=WORK_MODE_CHOICES)
-    technologies = RelationshipTo("TechnologyNode", "USES_TECHNOLOGY")
+    skills = RelationshipTo("SkillNode", "HAS_SKILL")
 
     company = RelationshipTo("CompanyInfoNode", "WORKED_AT")
     duration = RelationshipTo("EmploymentDurationNode", "HAS_DURATION")
@@ -149,7 +145,7 @@ class EmploymentHistoryItemNode(StructuredNode):
 class ProjectNode(StructuredNode):
     title = StringProperty(required=True)
     url = StringProperty()
-    technologies = RelationshipTo("TechnologyNode", "USES_TECHNOLOGY")
+    skills = RelationshipTo("SkillNode", "HAS_SKILL")
     key_points = RelationshipTo("KeyPointInfoNode", "HAS_KEY_POINT")
 
 
diff --git a/backend/storage/services/vector_db_service.py b/backend/storage/services/vector_db_service.py
@@ -50,7 +50,6 @@ def _ensure_collection(self) -> None:
             "source",
             "email",
             "skills",
-            "technologies",
             "companies",
             "role",
             "location",
@@ -115,7 +114,6 @@ def store_vectors(self, resume_id: str, vectors: list[EmbeddingVector]) -> list[
                 "email": v.email,
                 # Add searchable metadata
                 "skills": v.skills,
-                "technologies": v.technologies,
                 "companies": v.companies,
                 "role": v.role,
                 "location": v.location,
diff --git a/backend/test_script.py b/backend/test_script.py
diff --git a/frontend/src/pages/JobStatus.tsx b/frontend/src/pages/JobStatus.tsx
diff --git a/frontend/src/pages/Landing.tsx b/frontend/src/pages/Landing.tsx