import hashlib
import os
from datetime import datetime

import duckdb
import networkx as nx
import psutil


class GexfNodeGenerator:
    """Builds GEXF node files for GitHub repositories matching a set of topics."""

    def __init__(self):
        self.save_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "gexf")
        os.makedirs(self.save_dir, exist_ok=True)

        # DuckDB connection (copied from TopicService)
        db_path = os.path.join(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
            ),
            "public",
            "data",
            "github_meta.duckdb",
        )
        if not os.path.exists(db_path):
            raise FileNotFoundError(
                f"Database not found at {db_path}. Please ensure the database "
                "file exists before running the application."
            )

        self.con = duckdb.connect(database=db_path, read_only=True)
        # Cap DuckDB's memory at 30% of available RAM, never exceeding 512 MB
        available_memory = psutil.virtual_memory().available
        memory_limit = min(available_memory * 0.3, 0.5 * 1024 * 1024 * 1024)
        self.con.execute(f"SET memory_limit TO '{int(memory_limit)}B'")
        # Limit DuckDB to at most two physical cores
        cpu_count = psutil.cpu_count(logical=False) or 1
        thread_count = max(1, min(cpu_count, 2))
        self.con.execute(f"SET threads TO {thread_count}")
    def get_unique_filename(self, topics):
        """Generate a unique filename for a set of topics.

        Example output: topics_<12-char-md5>_<YYYYMMDD_HHMMSS>.gexf
        """
        # Sort topics so the same set hashes identically regardless of input order
        sorted_topics = sorted(topics)
        topics_str = "|".join(sorted_topics)
        # MD5 is used here only as a fast, non-cryptographic fingerprint
        hash_hex = hashlib.md5(topics_str.encode()).hexdigest()[:12]
        # Append a timestamp so repeated searches for the same topics do not collide
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"topics_{hash_hex}_{timestamp}.gexf"

    def generate_gexf_nodes_for_topics(self, topics):
        """
        Generate and store a GEXF file of all repos tagged with any of the given topics.
        Returns the path to the generated GEXF file, or None if no topics were given.
        """
        if not topics:
            return None

        # Generate a unique filename for this search
        filename = self.get_unique_filename(topics)
        gexf_path = os.path.join(self.save_dir, filename)

        # Match topics case-insensitively via parameterized placeholders
        topics_lower = [t.lower() for t in topics]
        placeholders = ",".join(["?"] * len(topics_lower))

        # Aggregate each matching repo's matched topics into a '|'-separated string.
        # (Only the topics that matched the search are aggregated, not all of the
        # repo's topics.) The original re-joined repo_topics and deduplicated with
        # DISTINCT; joining the aggregate CTE directly is equivalent and avoids
        # passing the parameter list twice.
        query = f"""
        WITH repo_topics_agg AS (
            SELECT t.repo AS nameWithOwner,
                   GROUP_CONCAT(t.topic, '|') AS topics
            FROM repo_topics t
            WHERE LOWER(t.topic) IN ({placeholders})
            GROUP BY t.repo
        )
        SELECT r.nameWithOwner, r.stars, r.forks, r.watchers, r.isFork, r.isArchived,
               r.languageCount, r.pullRequests, r.issues, r.primaryLanguage, r.createdAt,
               r.license, rt.topics
        FROM repos r
        JOIN repo_topics_agg rt ON r.nameWithOwner = rt.nameWithOwner
        """
        result = self.con.execute(query, topics_lower).fetchall()
        columns = [
            "nameWithOwner",
            "stars",
            "forks",
            "watchers",
            "isFork",
            "isArchived",
            "languageCount",
            "pullRequests",
            "issues",
            "primaryLanguage",
            "createdAt",
            "license",
            "topics",
        ]
        G = nx.Graph()
        G.graph["has_edges"] = False  # Indicate that this graph intentionally has no edges

        # Default values substituted for NULL database columns
        default_values = {
            "stars": 0,
            "forks": 0,
            "watchers": 0,
            "isFork": False,
            "isArchived": False,
            "languageCount": 0,
            "pullRequests": 0,
            "issues": 0,
            "primaryLanguage": "",
            "createdAt_year": 0,  # Only the creation year is kept
            "license": "",
            "topics": "",
        }

        # Document the expected node-attribute schema on the graph object.
        # (networkx's GEXF writer infers attribute types from the Python values
        # themselves, so this mapping is informational.)
        G.graph["node_attributes"] = {
            "createdAt_year": {"type": "integer"},  # Only the creation year is kept
            "stars": {"type": "integer"},
            "forks": {"type": "integer"},
            "watchers": {"type": "integer"},
            "isFork": {"type": "boolean"},
            "isArchived": {"type": "boolean"},
            "languageCount": {"type": "integer"},
            "pullRequests": {"type": "integer"},
            "issues": {"type": "integer"},
            "primaryLanguage": {"type": "string"},
            "license": {"type": "string"},
            "github_url": {"type": "string"},
            "topics": {"type": "string"},
        }

        for row in result:
            node_attrs = {}
            repo_name = None  # Set by the first column, nameWithOwner
            for col, val in zip(columns, row):
                if col == "nameWithOwner":
                    repo_name = val
                    # Link each node back to its repository on GitHub
                    node_attrs["github_url"] = f"https://github.com/{val}"
                elif col == "createdAt":
                    # Keep only the creation year
                    if val:
                        try:
                            if isinstance(val, str):
                                # Parse an ISO-format timestamp, e.g. "2018-06-02T04:08:16Z"
                                date = datetime.strptime(val.split("T")[0], "%Y-%m-%d")
                            else:
                                date = val  # Assume it is already a datetime object
                            node_attrs["createdAt_year"] = date.year
                        except (ValueError, TypeError) as e:
                            print(f"Error processing date for {repo_name}: {e}")
                            node_attrs["createdAt_year"] = 0  # Fall back to the default
                    else:
                        node_attrs["createdAt_year"] = 0
                elif col == "topics":
                    # Topics arrive as a '|'-separated string from GROUP_CONCAT
                    node_attrs[col] = val if val else default_values[col]
                else:
                    # Substitute the default when the database value is NULL
                    node_attrs[col] = default_values[col] if val is None else val
            G.add_node(repo_name, **node_attrs)

        nx.write_gexf(G, gexf_path)
        return gexf_path  # Unique path of the generated file
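

# A minimal usage sketch (hypothetical topic names; assumes the DuckDB file and
# directory layout described in __init__ above):
if __name__ == "__main__":
    generator = GexfNodeGenerator()
    output_path = generator.generate_gexf_nodes_for_topics(
        ["machine-learning", "data-visualization"]
    )
    print(f"GEXF file written to: {output_path}")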