Skip to content

Commit 8c48495

Browse files
committed
optimize query
1 parent daf1bc6 commit 8c48495

File tree

6 files changed

+289
-295
lines changed

6 files changed

+289
-295
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ dist
1212
*repo_metadata.json
1313
__pycache__
1414
*.duckdb
15-
*.gexf
15+
*.gexf
16+
*.gexf.gz

backend/app/main.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,12 @@ def suggest_topics():
193193
# 2. Prioritizes exact matches and high-frequency topics
194194
# 3. Uses word boundary matching for better relevance
195195
sql_query = """
196-
WITH ranked_topics AS (
196+
WITH split_topics AS (
197+
SELECT
198+
unnest(string_split(topics, '|')) as topic
199+
FROM repo_topics
200+
),
201+
ranked_topics AS (
197202
SELECT
198203
topic,
199204
COUNT(*) as count,
@@ -202,7 +207,7 @@ def suggest_topics():
202207
WHEN LOWER(topic) LIKE ? THEN 2 -- Starts with query gets second priority
203208
ELSE 1 -- Contains query gets lowest priority
204209
END as match_priority
205-
FROM repo_topics
210+
FROM split_topics
206211
WHERE LOWER(topic) LIKE ?
207212
GROUP BY topic
208213
)
@@ -246,6 +251,13 @@ def finalized_node_gexf():
246251
topics = data.get("topics", [])
247252
gexf_path = gexf_node_service.generate_gexf_nodes_for_topics(topics)
248253
# print(topics)
254+
255+
if gexf_path is None:
256+
return jsonify({
257+
"success": False,
258+
"error": "No repositories found for the given topics"
259+
}), 404
260+
249261
# Read the GEXF file content
250262
with open(gexf_path, "r", encoding="utf-8") as f:
251263
gexf_content = f.read()
@@ -463,14 +475,16 @@ def get_unique_repos():
463475
placeholders = ",".join(["?"] * len(topics_lower))
464476

465477
# Query to get unique repositories that have ANY of the given topics
466-
query = f"""
478+
# Build one LIKE pattern from all topics; note it only matches rows whose topics string contains every topic in the given order, not any one of them
479+
search_pattern = '%' + '%'.join(topics_lower) + '%'
480+
query = """
467481
SELECT COUNT(DISTINCT r.nameWithOwner) as count
468482
FROM repos r
469483
JOIN repo_topics t ON r.nameWithOwner = t.repo
470-
WHERE LOWER(t.topic) IN ({placeholders})
484+
WHERE LOWER(t.topics) LIKE ?
471485
"""
472486

473-
result = topic_service.con.execute(query, topics_lower).fetchone()
487+
result = topic_service.con.execute(query, [search_pattern]).fetchone()
474488
count = result[0] if result else 0
475489

476490
return jsonify({

backend/app/services/edge_generation_service.py

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,12 @@ def generate_edges_with_criteria(
108108
edge_stats['criteria_used'] = [k for k, v in criteria_config.items() if v]
109109
edge_stats['combination_logic_applied'] = True
110110

111+
# Debug information
112+
print(f"Generated {len(all_edges)} total edges")
113+
print(f"Final edges after combination logic: {len(final_edges)}")
114+
print(f"Edges in graph: {total_edges}")
115+
print(f"Edge stats: {edge_stats}")
116+
111117
return G, edge_stats
112118

113119
def _get_repos_for_topics(self, topics: List[str]) -> List[Dict]:
@@ -118,12 +124,17 @@ def _get_repos_for_topics(self, topics: List[str]) -> List[Dict]:
118124
topics_lower = [t.lower() for t in topics]
119125
placeholders = ",".join(["?"] * len(topics_lower))
120126

127+
# Build one substring (LIKE) condition per topic; the conditions are OR-ed together in the query below
128+
conditions = []
129+
for topic in topics_lower:
130+
conditions.append(f"LOWER(t.topics) LIKE '%{topic}%'")
131+
121132
query = f"""
122133
WITH matching_repos AS (
123134
SELECT DISTINCT r.nameWithOwner
124135
FROM repos r
125136
JOIN repo_topics t ON r.nameWithOwner = t.repo
126-
WHERE LOWER(t.topic) IN ({placeholders})
137+
WHERE ({" OR ".join(conditions)})
127138
),
128139
repo_data AS (
129140
SELECT
@@ -136,22 +147,19 @@ def _get_repos_for_topics(self, topics: List[str]) -> List[Dict]:
136147
r.pullRequests,
137148
r.issues,
138149
r.primaryLanguage,
139-
r.createdAt,
150+
r.createdAt_year,
140151
r.license,
141152
r.bigquery_contributors,
142153
r.bigquery_stargazers,
143-
GROUP_CONCAT(t.topic, '|') AS topics
154+
t.topics
144155
FROM repos r
145156
JOIN repo_topics t ON r.nameWithOwner = t.repo
146157
JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
147-
GROUP BY r.nameWithOwner, r.stars, r.forks, r.watchers, r.isArchived,
148-
r.languageCount, r.pullRequests, r.issues, r.primaryLanguage,
149-
r.createdAt, r.license, r.bigquery_contributors, r.bigquery_stargazers
150158
)
151159
SELECT * FROM repo_data
152160
"""
153161

154-
result = self.con.execute(query, topics_lower).fetchall()
162+
result = self.con.execute(query).fetchall()
155163

156164
columns = [
157165
"nameWithOwner", "stars", "forks", "watchers", "isArchived",
@@ -188,18 +196,27 @@ def _extract_year(self, date_val) -> int:
188196
if not date_val:
189197
return 0
190198
try:
191-
if isinstance(date_val, str):
199+
if isinstance(date_val, int):
200+
return date_val
201+
elif isinstance(date_val, str):
192202
date = datetime.strptime(date_val.split('T')[0], "%Y-%m-%d")
203+
return date.year
193204
else:
194-
date = date_val
195-
return date.year
205+
return date_val.year
196206
except (ValueError, TypeError):
197207
return 0
198208

199209
def _format_list_data(self, data) -> str:
200210
"""Format list data as comma-separated string."""
201-
if data and isinstance(data, list):
211+
if not data:
212+
return ""
213+
if isinstance(data, list):
202214
return ",".join(data)
215+
elif isinstance(data, str):
216+
# Handle string representation of list
217+
if data.startswith('[') and data.endswith(']'):
218+
# Strip the surrounding brackets; the inner text is returned unchanged (no splitting is done)
219+
return data[1:-1]
203220
return ""
204221

205222
def _generate_topic_based_edges(self, G: nx.Graph, repos: List[Dict]) -> List[Tuple]:
@@ -424,6 +441,18 @@ def save_graph_with_edges(self, G: nx.Graph, output_path: str):
424441
G.graph['has_edges'] = True
425442
G.graph['edge_generation_criteria'] = 'Multiple criteria combination'
426443

444+
# Ensure edge attributes are properly set
445+
for u, v, data in G.edges(data=True):
446+
# Convert complex data structures to strings for GEXF compatibility
447+
if 'shared_topics' in data and isinstance(data['shared_topics'], list):
448+
data['shared_topics'] = '|'.join(data['shared_topics'])
449+
if 'shared_contributors' in data and isinstance(data['shared_contributors'], list):
450+
data['shared_contributors'] = '|'.join(data['shared_contributors'])
451+
if 'shared_stargazers' in data and isinstance(data['shared_stargazers'], list):
452+
data['shared_stargazers'] = '|'.join(data['shared_stargazers'])
453+
if 'criteria_satisfied' in data and isinstance(data['criteria_satisfied'], list):
454+
data['criteria_satisfied'] = '|'.join(data['criteria_satisfied'])
455+
427456
# Write to GEXF file
428457
nx.write_gexf(G, output_path)
429458
return output_path

0 commit comments

Comments
 (0)