data-exp-lab
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 1 deletion b/‎.gitignore‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎backend/app/main.py‎
Lines changed: 19 additions & 5 deletions b/‎backend/app/main.py‎
Lines changed: 19 additions & 5 deletions
diff --git a/‎backend/app/services/edge_generation_service.py‎
Lines changed: 40 additions & 11 deletions b/‎backend/app/services/edge_generation_service.py‎
Lines changed: 40 additions & 11 deletions
@@ -12,4 +12,5 @@ dist
 *repo_metadata.json
 __pycache__
 *.duckdb
-*.gexf
+*.gexf
+*.gexf.gz
@@ -193,7 +193,12 @@ def suggest_topics():
         # 2. Prioritizes exact matches and high-frequency topics
         # 3. Uses word boundary matching for better relevance
         sql_query = """
-            WITH ranked_topics AS (
+            WITH split_topics AS (
+                SELECT 
+                    unnest(string_split(topics, '|')) as topic
+                FROM repo_topics
+            ),
+            ranked_topics AS (
                 SELECT 
                     topic,
                     COUNT(*) as count,
@@ -202,7 +207,7 @@ def suggest_topics():
                         WHEN LOWER(topic) LIKE ? THEN 2  -- Starts with query gets second priority
                         ELSE 1  -- Contains query gets lowest priority
                     END as match_priority
-                FROM repo_topics
+                FROM split_topics
                 WHERE LOWER(topic) LIKE ?
                 GROUP BY topic
             )
@@ -246,6 +251,13 @@ def finalized_node_gexf():
     topics = data.get("topics", [])
     gexf_path = gexf_node_service.generate_gexf_nodes_for_topics(topics)
     # print(topics)
+    
+    if gexf_path is None:
+        return jsonify({
+            "success": False,
+            "error": "No repositories found for the given topics"
+        }), 404
+    
     # Read the GEXF file content
     with open(gexf_path, "r", encoding="utf-8") as f:
         gexf_content = f.read()
@@ -463,14 +475,16 @@ def get_unique_repos():
         placeholders = ",".join(["?"] * len(topics_lower))
 
         # Query to get unique repositories that have ANY of the given topics
-        query = f"""
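+        # NOTE(review): this builds ONE pattern of the form '%t1%t2%...%', so LIKE matches only rows whose topics string contains every topic, in list order -- not ANY of the topics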
+        search_pattern = '%' + '%'.join(topics_lower) + '%'
+        query = """
             SELECT COUNT(DISTINCT r.nameWithOwner) as count
             FROM repos r
             JOIN repo_topics t ON r.nameWithOwner = t.repo
-            WHERE LOWER(t.topic) IN ({placeholders})
+            WHERE LOWER(t.topics) LIKE ?
         """
 
-        result = topic_service.con.execute(query, topics_lower).fetchone()
+        result = topic_service.con.execute(query, [search_pattern]).fetchone()
         count = result[0] if result else 0
 
         return jsonify({
 
@@ -108,6 +108,12 @@ def generate_edges_with_criteria(
         edge_stats['criteria_used'] = [k for k, v in criteria_config.items() if v]
         edge_stats['combination_logic_applied'] = True
 
+        # Debug information
+        print(f"Generated {len(all_edges)} total edges")
+        print(f"Final edges after combination logic: {len(final_edges)}")
+        print(f"Edges in graph: {total_edges}")
+        print(f"Edge stats: {edge_stats}")
+        
         return G, edge_stats
 
     def _get_repos_for_topics(self, topics: List[str]) -> List[Dict]:
@@ -118,12 +124,17 @@ def _get_repos_for_topics(self, topics: List[str]) -> List[Dict]:
         topics_lower = [t.lower() for t in topics]
         placeholders = ",".join(["?"] * len(topics_lower))
 
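+        # Build one "LOWER(t.topics) LIKE '%topic%'" condition per topic, combined with OR below.
+        # NOTE(review): topic text is interpolated directly into the SQL string (no bound parameters), so quotes or LIKE wildcards in a topic change the query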
+        conditions = []
+        for topic in topics_lower:
+            conditions.append(f"LOWER(t.topics) LIKE '%{topic}%'")
+        
         query = f"""
             WITH matching_repos AS (
                 SELECT DISTINCT r.nameWithOwner
                 FROM repos r
                 JOIN repo_topics t ON r.nameWithOwner = t.repo
-                WHERE LOWER(t.topic) IN ({placeholders})
+                WHERE ({" OR ".join(conditions)})
             ),
             repo_data AS (
                 SELECT 
@@ -136,22 +147,19 @@ def _get_repos_for_topics(self, topics: List[str]) -> List[Dict]:
                     r.pullRequests,
                     r.issues,
                     r.primaryLanguage,
-                    r.createdAt,
+                    r.createdAt_year,
                     r.license,
                     r.bigquery_contributors,
                     r.bigquery_stargazers,
-                    GROUP_CONCAT(t.topic, '|') AS topics
+                    t.topics
                 FROM repos r
                 JOIN repo_topics t ON r.nameWithOwner = t.repo
                 JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
-                GROUP BY r.nameWithOwner, r.stars, r.forks, r.watchers, r.isArchived, 
-                         r.languageCount, r.pullRequests, r.issues, r.primaryLanguage, 
-                         r.createdAt, r.license, r.bigquery_contributors, r.bigquery_stargazers
             )
             SELECT * FROM repo_data
         """
 
-        result = self.con.execute(query, topics_lower).fetchall()
+        result = self.con.execute(query).fetchall()
 
         columns = [
             "nameWithOwner", "stars", "forks", "watchers", "isArchived",
@@ -188,18 +196,27 @@ def _extract_year(self, date_val) -> int:
         if not date_val:
             return 0
         try:
-            if isinstance(date_val, str):
+            if isinstance(date_val, int):
+                return date_val
+            elif isinstance(date_val, str):
                 date = datetime.strptime(date_val.split('T')[0], "%Y-%m-%d")
+                return date.year
             else:
-                date = date_val
-            return date.year
+                return date_val.year
         except (ValueError, TypeError):
             return 0
 
     def _format_list_data(self, data) -> str:
         """Format list data as comma-separated string.
 
         Behavior of the patched version (the ``+`` side of this hunk):
         falsy input (None, empty list, empty string) yields ""; a list is
         joined with ","; a string that starts with "[" and ends with "]" is
         returned with those two characters removed and the inner text left
         unchanged; any other input, including a string without surrounding
         brackets, yields "".
         """
-        if data and isinstance(data, list):
+        if not data:
+            return ""
+        if isinstance(data, list):
             return ",".join(data)
+        elif isinstance(data, str):
+            # Handle string representation of list
+            if data.startswith('[') and data.endswith(']'):
+                # Strip only the surrounding brackets; the inner text is returned verbatim (it is not split, trimmed, or unquoted)
+                return data[1:-1]
         return ""
 
     def _generate_topic_based_edges(self, G: nx.Graph, repos: List[Dict]) -> List[Tuple]:
@@ -424,6 +441,18 @@ def save_graph_with_edges(self, G: nx.Graph, output_path: str):
         G.graph['has_edges'] = True
         G.graph['edge_generation_criteria'] = 'Multiple criteria combination'
 
+        # Ensure edge attributes are properly set
+        for u, v, data in G.edges(data=True):
+            # Convert complex data structures to strings for GEXF compatibility
+            if 'shared_topics' in data and isinstance(data['shared_topics'], list):
+                data['shared_topics'] = '|'.join(data['shared_topics'])
+            if 'shared_contributors' in data and isinstance(data['shared_contributors'], list):
+                data['shared_contributors'] = '|'.join(data['shared_contributors'])
+            if 'shared_stargazers' in data and isinstance(data['shared_stargazers'], list):
+                data['shared_stargazers'] = '|'.join(data['shared_stargazers'])
+            if 'criteria_satisfied' in data and isinstance(data['criteria_satisfied'], list):
+                data['criteria_satisfied'] = '|'.join(data['criteria_satisfied'])
+        
         # Write to GEXF file
         nx.write_gexf(G, output_path)
         return output_path