import hashlib
import os
from datetime import datetime

import duckdb
import networkx as nx
import psutil


class GexfNodeGenerator:
    """Builds GEXF node files for GitHub repositories matching a set of topics."""

    def __init__(self):
        self.save_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "gexf")
        os.makedirs(self.save_dir, exist_ok=True)

        # DuckDB connection (copied from TopicService)
        db_path = os.path.join(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
            ),
            "public",
            "data",
            "github_meta.duckdb",
        )
        if not os.path.exists(db_path):
            raise FileNotFoundError(
                f"Database not found at {db_path}. Please ensure the database "
                "file exists before running the application."
            )

        self.con = duckdb.connect(database=db_path, read_only=True)
        # Cap DuckDB's memory at 30% of available RAM, never exceeding 512 MB
        available_memory = psutil.virtual_memory().available
        memory_limit = min(available_memory * 0.3, 0.5 * 1024 * 1024 * 1024)
        self.con.execute(f"SET memory_limit TO '{int(memory_limit)}B'")
        # Limit DuckDB to at most two physical cores
        cpu_count = psutil.cpu_count(logical=False) or 1
        thread_count = max(1, min(cpu_count, 2))
        self.con.execute(f"SET threads TO {thread_count}")
    def get_unique_filename(self, topics):
        """Generate a unique filename for a set of topics.

        Example output: topics_<12-char-md5>_<YYYYMMDD_HHMMSS>.gexf
        """
        # Sort topics so the same set hashes identically regardless of input order
        sorted_topics = sorted(topics)
        topics_str = "|".join(sorted_topics)
        # MD5 is used here only as a fast, non-cryptographic fingerprint
        hash_hex = hashlib.md5(topics_str.encode()).hexdigest()[:12]
        # Append a timestamp so repeated searches for the same topics do not collide
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"topics_{hash_hex}_{timestamp}.gexf"

    def generate_gexf_nodes_for_topics(self, topics):
        """
        Generate and store a GEXF file of all repos tagged with any of the given topics.
        Returns the path to the generated GEXF file, or None if no topics were given.
        """
        if not topics:
            return None

        # Generate a unique filename for this search
        filename = self.get_unique_filename(topics)
        gexf_path = os.path.join(self.save_dir, filename)

        # Match topics case-insensitively via parameterized placeholders
        topics_lower = [t.lower() for t in topics]
        placeholders = ",".join(["?"] * len(topics_lower))

        # Aggregate each matching repo's matched topics into a '|'-separated string.
        # (Only the topics that matched the search are aggregated, not all of the
        # repo's topics.) The original re-joined repo_topics and deduplicated with
        # DISTINCT; joining the aggregate CTE directly is equivalent and avoids
        # passing the parameter list twice.
        query = f"""
        WITH repo_topics_agg AS (
            SELECT t.repo AS nameWithOwner,
                   GROUP_CONCAT(t.topic, '|') AS topics
            FROM repo_topics t
            WHERE LOWER(t.topic) IN ({placeholders})
            GROUP BY t.repo
        )
        SELECT r.nameWithOwner, r.stars, r.forks, r.watchers, r.isFork, r.isArchived,
               r.languageCount, r.pullRequests, r.issues, r.primaryLanguage, r.createdAt,
               r.license, rt.topics
        FROM repos r
        JOIN repo_topics_agg rt ON r.nameWithOwner = rt.nameWithOwner
        """
        result = self.con.execute(query, topics_lower).fetchall()
        columns = [
            "nameWithOwner",
            "stars",
            "forks",
            "watchers",
            "isFork",
            "isArchived",
            "languageCount",
            "pullRequests",
            "issues",
            "primaryLanguage",
            "createdAt",
            "license",
            "topics",
        ]
        G = nx.Graph()
        G.graph["has_edges"] = False  # Indicate that this graph intentionally has no edges

        # Default values substituted for NULL database columns
        default_values = {
            "stars": 0,
            "forks": 0,
            "watchers": 0,
            "isFork": False,
            "isArchived": False,
            "languageCount": 0,
            "pullRequests": 0,
            "issues": 0,
            "primaryLanguage": "",
            "createdAt_year": 0,  # Only the creation year is kept
            "license": "",
            "topics": "",
        }

        # Document the expected node-attribute schema on the graph object.
        # (networkx's GEXF writer infers attribute types from the Python values
        # themselves, so this mapping is informational.)
        G.graph["node_attributes"] = {
            "createdAt_year": {"type": "integer"},  # Only the creation year is kept
            "stars": {"type": "integer"},
            "forks": {"type": "integer"},
            "watchers": {"type": "integer"},
            "isFork": {"type": "boolean"},
            "isArchived": {"type": "boolean"},
            "languageCount": {"type": "integer"},
            "pullRequests": {"type": "integer"},
            "issues": {"type": "integer"},
            "primaryLanguage": {"type": "string"},
            "license": {"type": "string"},
            "github_url": {"type": "string"},
            "topics": {"type": "string"},
        }

        for row in result:
            node_attrs = {}
            repo_name = None  # Set by the first column, nameWithOwner
            for col, val in zip(columns, row):
                if col == "nameWithOwner":
                    repo_name = val
                    # Link each node back to its repository on GitHub
                    node_attrs["github_url"] = f"https://github.com/{val}"
                elif col == "createdAt":
                    # Keep only the creation year
                    if val:
                        try:
                            if isinstance(val, str):
                                # Parse an ISO-format timestamp, e.g. "2018-06-02T04:08:16Z"
                                date = datetime.strptime(val.split("T")[0], "%Y-%m-%d")
                            else:
                                date = val  # Assume it is already a datetime object
                            node_attrs["createdAt_year"] = date.year
                        except (ValueError, TypeError) as e:
                            print(f"Error processing date for {repo_name}: {e}")
                            node_attrs["createdAt_year"] = 0  # Fall back to the default
                    else:
                        node_attrs["createdAt_year"] = 0
                elif col == "topics":
                    # Topics arrive as a '|'-separated string from GROUP_CONCAT
                    node_attrs[col] = val if val else default_values[col]
                else:
                    # Substitute the default when the database value is NULL
                    node_attrs[col] = default_values[col] if val is None else val
            G.add_node(repo_name, **node_attrs)

        nx.write_gexf(G, gexf_path)
        return gexf_path  # Unique path of the generated file
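

# A minimal usage sketch (hypothetical topic names; assumes the DuckDB file and
# directory layout described in __init__ above):
if __name__ == "__main__":
    generator = GexfNodeGenerator()
    output_path = generator.generate_gexf_nodes_for_topics(
        ["machine-learning", "data-visualization"]
    )
    print(f"GEXF file written to: {output_path}")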