Skip to content

Commit bf41e33

Browse files
authored
Merge pull request #17 from data-exp-lab/ui
Bridge Topics and Graph Preparation for deploy branch.
2 parents 45020eb + ec7d660 commit bf41e33

22 files changed

+983
-239571
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ dist
1111
*temp
1212
*repo_metadata.json
1313
__pycache__
14-
*.duckdb
14+
*.duckdb
15+
*.gexf

backend/app/main.py

Lines changed: 59 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
1-
from flask import Flask, jsonify, request
1+
from flask import Flask, jsonify, request, send_file, url_for
22
from flask_cors import CORS
33
from services.topic_service import TopicService
44
from services.ai_service import AITopicProcessor
5+
from services.gexy_node_service import GexfNodeGenerator
56
import os
67
import asyncio
78
import re
89

9-
app = Flask(__name__)
10+
app = Flask(__name__, static_folder='gexf', static_url_path='/gexf')
1011
CORS(
1112
app,
1213
resources={
@@ -20,6 +21,7 @@
2021

2122
topic_service = TopicService()
2223
ai_processor = AITopicProcessor()
24+
gexy_node_service = GexfNodeGenerator()
2325

2426

2527
@app.route("/api/process-topics", methods=["GET", "POST"])
@@ -91,7 +93,7 @@ def ai_process():
9193
def explain_topic():
9294
try:
9395
data = request.get_json()
94-
print("Received explain-topic request with data:", {k: v for k, v in data.items() if k != 'apiKey'}) # Log data without API key
96+
# print("Received explain-topic request with data:", {k: v for k, v in data.items() if k != 'apiKey'}) # Log data without API key
9597

9698
topic = data.get("topic", "")
9799
search_term = data.get("searchTerm", "")
@@ -235,6 +237,60 @@ def suggest_topics():
235237
}), 500
236238

237239

240+
@app.route("/api/generated-node-gexf", methods=["POST"])
def finalized_node_gexf():
    """Generate a node-only GEXF file for the requested topics and return its content.

    Expects JSON: {"topics": [str, ...]}.
    Returns {"success": True, "gexfContent": str} on success,
    {"success": False, "error": str} with HTTP 400 for an empty topic list,
    or HTTP 500 on unexpected failure.
    """
    try:
        data = request.get_json()
        topics = data.get("topics", []) if data else []

        gexf_path = gexy_node_service.generate_gexf_nodes_for_topics(topics)
        # generate_gexf_nodes_for_topics returns None for an empty topic list;
        # without this guard, open(None) below raises TypeError.
        if gexf_path is None:
            return jsonify({
                "success": False,
                "error": "No topics provided"
            }), 400

        # Read the GEXF file content back for the JSON response.
        with open(gexf_path, "r", encoding="utf-8") as f:
            gexf_content = f.read()

        return jsonify({
            "success": True,
            "gexfContent": gexf_content
        })
    except Exception as e:
        # Mirror the error-handling style of the sibling routes.
        print(f"Error generating node GEXF: {str(e)}")
        return jsonify({
            "success": False,
            "error": str(e)
        }), 500
254+
255+
256+
@app.route("/api/get-unique-repos", methods=["POST"])
def get_unique_repos():
    """Count distinct repositories tagged with ANY of the submitted topics."""
    try:
        payload = request.get_json()
        requested = payload.get("topics", [])

        # An empty topic list trivially matches zero repositories.
        if not requested:
            return jsonify({"success": True, "count": 0})

        # Normalise the input for case-insensitive matching against LOWER(t.topic).
        lowered = [topic.lower() for topic in requested]
        marks = ",".join("?" for _ in lowered)

        # Count unique repositories carrying at least one of the given topics.
        query = f"""
            SELECT COUNT(DISTINCT r.nameWithOwner) as count
            FROM repos r
            JOIN repo_topics t ON r.nameWithOwner = t.repo
            WHERE LOWER(t.topic) IN ({marks})
        """

        row = topic_service.con.execute(query, lowered).fetchone()
        return jsonify({"success": True, "count": row[0] if row else 0})
    except Exception as e:
        print(f"Error getting unique repos: {str(e)}")
        return jsonify({"success": False, "error": str(e)}), 500
292+
293+
238294
@app.route("/")
def home():
    """Root endpoint; simple liveness check."""
    return "Hello World!"

backend/app/services/ai_service.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import re
77

88
# Configure logging
9-
logging.basicConfig(level=logging.DEBUG)
9+
# logging.basicConfig(level=logging.DEBUG)
1010
logger = logging.getLogger(__name__)
1111

1212
class AITopicProcessor:
@@ -136,7 +136,7 @@ async def process_with_gemini(self, prompt: str, topics: List[str], search_term:
136136
"""
137137

138138
response = self.gemini_client.generate_content(full_prompt)
139-
print("Raw response:", response.text) # Debug print
139+
# print("Raw response:", response.text) # Debug print
140140

141141
if response.text:
142142
suggestions = response.text.strip().split("\n")
@@ -174,7 +174,7 @@ async def process_with_gemini(self, prompt: str, topics: List[str], search_term:
174174
"topic": s.strip(),
175175
"explanation": f"Suggested as relevant to {search_term}"
176176
})
177-
print("Processed suggestions:", processed_suggestions) # Debug print
177+
# print("Processed suggestions:", processed_suggestions) # Debug print
178178
return processed_suggestions
179179
return []
180180

backend/app/services/gexy_node_service.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
import os
2+
import duckdb
3+
import psutil
4+
import networkx as nx
5+
from datetime import datetime
6+
import hashlib
7+
from pathlib import Path
8+
9+
10+
class GexfNodeGenerator:
    """Build node-only GEXF graph files for GitHub repos matching a set of topics.

    Repo metadata is read from the shared read-only DuckDB database; each
    request produces a uniquely named ``.gexf`` file under ``backend/app/gexf/``.
    The generated graph has no edges — one node per repository, with repo
    metadata attached as node attributes.
    """

    def __init__(self):
        # Output directory for generated GEXF files (backend/app/gexf).
        self.save_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "gexf")
        os.makedirs(self.save_dir, exist_ok=True)

        # DuckDB connection (copied from TopicService): the database lives at
        # <repo root>/public/data/github_meta.duckdb relative to this file.
        db_path = os.path.join(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
            ),
            "public",
            "data",
            "github_meta.duckdb",
        )
        # Fail fast with a clear message if the database is absent.
        if not os.path.exists(db_path):
            raise FileNotFoundError(
                f"Database not found at {db_path}. Please ensure the database file exists before running the application."
            )
        self.con = duckdb.connect(database=db_path, read_only=True)
        # Keep DuckDB's footprint small: at most 30% of free RAM, capped at 512 MB.
        available_memory = psutil.virtual_memory().available
        memory_limit = min(available_memory * 0.3, 0.5 * 1024 * 1024 * 1024)
        self.con.execute(f"SET memory_limit TO '{int(memory_limit)}B'")
        # Limit DuckDB to at most two physical cores.
        cpu_count = psutil.cpu_count(logical=False) or 1
        thread_count = max(1, min(cpu_count, 2))
        self.con.execute(f"SET threads TO {thread_count}")

    def get_unique_filename(self, topics):
        """Return a unique ``.gexf`` filename derived from *topics*.

        The name embeds the first 12 hex chars of an MD5 digest of the sorted
        topics (so the same topic set yields the same hash regardless of
        order), plus a timestamp so repeated requests never collide. MD5 is
        used only as a cheap content fingerprint, not for security.
        """
        sorted_topics = sorted(topics)
        topics_str = "|".join(sorted_topics)
        hash_object = hashlib.md5(topics_str.encode())
        hash_hex = hash_object.hexdigest()[:12]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"topics_{hash_hex}_{timestamp}.gexf"

    def generate_gexf_nodes_for_topics(self, topics):
        """
        Generate and store a GEXF file for all repos containing any of the given topics.
        Returns the path to the generated GEXF file, or None if *topics* is empty.
        """
        if not topics:
            return None

        # Generate a unique output path for this search.
        filename = self.get_unique_filename(topics)
        gexf_path = os.path.join(self.save_dir, filename)

        # Case-insensitive topic matching.
        topics_lower = [t.lower() for t in topics]
        placeholders = ",".join(["?"] * len(topics_lower))

        # The CTE aggregates each matching repo's matched topics into a
        # pipe-separated string; the outer query pulls repo metadata. The
        # placeholder list is bound twice, once per IN clause.
        query = f"""
        WITH repo_topics_agg AS (
            SELECT r.nameWithOwner,
                   GROUP_CONCAT(t.topic, '|') as topics
            FROM repos r
            JOIN repo_topics t ON r.nameWithOwner = t.repo
            WHERE LOWER(t.topic) IN ({placeholders})
            GROUP BY r.nameWithOwner
        )
        SELECT DISTINCT r.nameWithOwner, r.stars, r.forks, r.watchers, r.isFork, r.isArchived,
               r.languageCount, r.pullRequests, r.issues, r.primaryLanguage, r.createdAt,
               r.license, rt.topics
        FROM repos r
        JOIN repo_topics t ON r.nameWithOwner = t.repo
        JOIN repo_topics_agg rt ON r.nameWithOwner = rt.nameWithOwner
        WHERE LOWER(t.topic) IN ({placeholders})
        """
        result = self.con.execute(query, topics_lower + topics_lower).fetchall()

        # Column order must match the SELECT list above; nameWithOwner is first,
        # which guarantees repo_name is bound before any other column is handled.
        columns = [
            "nameWithOwner",
            "stars",
            "forks",
            "watchers",
            "isFork",
            "isArchived",
            "languageCount",
            "pullRequests",
            "issues",
            "primaryLanguage",
            "createdAt",
            "license",
            "topics",
        ]

        G = nx.Graph()
        G.graph['has_edges'] = False  # Marker for consumers: this graph has no edges

        # Fallbacks used when a column comes back NULL from the database.
        default_values = {
            "stars": 0,
            "forks": 0,
            "watchers": 0,
            "isFork": False,
            "isArchived": False,
            "languageCount": 0,
            "pullRequests": 0,
            "issues": 0,
            "primaryLanguage": "",
            "createdAt_year": 0,  # Only the year of createdAt is kept
            "license": "",
            "topics": "",
        }

        # Declared attribute types for GEXF consumers.
        G.graph['node_attributes'] = {
            'createdAt_year': {'type': 'integer'},  # Only the year is kept
            'stars': {'type': 'integer'},
            'forks': {'type': 'integer'},
            'watchers': {'type': 'integer'},
            'isFork': {'type': 'boolean'},
            'isArchived': {'type': 'boolean'},
            'languageCount': {'type': 'integer'},
            'pullRequests': {'type': 'integer'},
            'issues': {'type': 'integer'},
            'primaryLanguage': {'type': 'string'},
            'license': {'type': 'string'},
            'github_url': {'type': 'string'},
            'topics': {'type': 'string'},  # pipe-separated list of matched topics
        }

        for row in result:
            node_attrs = {}
            for col, val in zip(columns, row):
                if col == "nameWithOwner":
                    repo_name = val
                    # Derive the GitHub URL from the owner/name pair.
                    node_attrs["github_url"] = f"https://github.com/{val}"
                elif col == "createdAt":
                    # Reduce the creation date to just its year.
                    if val:
                        try:
                            if isinstance(val, str):
                                # Parse ISO format date (e.g., "2018-06-02T04:08:16Z")
                                date = datetime.strptime(val.split('T')[0], "%Y-%m-%d")
                            else:
                                date = val  # Assume it's already a datetime object
                            node_attrs["createdAt_year"] = date.year
                        except (ValueError, TypeError) as e:
                            print(f"Error processing date for {repo_name}: {e}")
                            node_attrs["createdAt_year"] = 0
                    else:
                        node_attrs["createdAt_year"] = 0
                elif col == "topics":
                    # Pipe-separated string built by GROUP_CONCAT above.
                    node_attrs[col] = val if val else default_values[col]
                else:
                    # Substitute the default when the database value is NULL.
                    node_attrs[col] = default_values[col] if val is None else val
            G.add_node(repo_name, **node_attrs)

        nx.write_gexf(G, gexf_path)
        return gexf_path  # Return the unique file path

backend/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@ uvicorn
88
kaggle>=1.5.16
99
numpy
1010
pandas
11-
psutil
11+
psutil
12+
networkx

0 commit comments

Comments
 (0)