Skip to content

Commit 0dfb979

Browse files
jor2, Jordan-Williams2, daniel-butler-irl
authored
feat: add submodule description (#34)
* feat: add submodule description * fix: some readmes wrong --------- Co-authored-by: Jordan-Williams2 <[email protected]> Co-authored-by: Daniel Butler <[email protected]>
1 parent e9dc2ce commit 0dfb979

File tree

4 files changed

+748
-133
lines changed

4 files changed

+748
-133
lines changed

scripts/generate_module_index.py

Lines changed: 208 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -213,35 +213,184 @@ def parse_iso_date(date_str: str) -> Optional[datetime]:
213213
return None
214214

215215

216+
def _truncate_at_word(text: str, max_len: int = 1200) -> str:
    """Truncate text to at most max_len chars at a word boundary, adding "..."."""
    if len(text) <= max_len:
        return text
    # Reserve 3 chars for the ellipsis, then back up to the last space.
    truncated = text[:max_len - 3]
    last_space = truncated.rfind(" ")
    if last_space > 0:
        return truncated[:last_space] + "..."
    return truncated + "..."


def _extract_bullet_items(para: str) -> list[str]:
    """Extract short, cleaned bullet-list items ("-" or "*") from a paragraph."""
    items = []
    for line in para.split("\n"):
        line = line.strip()
        if not line.startswith(("-", "*")):
            continue
        item = line.lstrip("-*").strip()
        # Replace markdown links [text](url) with just their text.
        item = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", item)
        # Keep only the part before any ": explanation" suffix.
        if ":" in item:
            item = item.split(":")[0].strip()
        # Normalize internal whitespace/newlines.
        item = " ".join(item.split())
        # Drop empty or overly long entries.
        if item and len(item) < 80:
            items.append(item)
    return items


async def fetch_submodule_description(
    gh_client: "GitHubClient",
    owner: str,
    repo: str,
    submodule_path: str
) -> str:
    """
    Extract a brief description from a submodule's README.

    Fetches <submodule_path>/README.md via gh_client and returns either:
    - "intro: item1, item2, ... etc" when the first meaningful paragraph
      ends with a colon and the next paragraph is a bullet list, or
    - the first sentence of the first meaningful paragraph, or
    - "" when there is no content, no meaningful paragraph, or any error
      occurs (best-effort by design: failures are swallowed).

    Args:
        gh_client: GitHubClient instance
        owner: Repository owner
        repo: Repository name
        submodule_path: Path to the submodule directory

    Returns:
        Brief description extracted from the submodule's README
    """
    try:
        # Try to fetch the submodule's README
        readme_path = f"{submodule_path}/README.md"
        readme_data = await gh_client.get_file_content(owner, repo, readme_path)
        content = readme_data.get("decoded_content", "")

        if not content:
            return ""

        # Extract first meaningful paragraph (simple extraction)
        paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]

        in_html_comment = False
        # enumerate() instead of paragraphs.index(para): index() finds the
        # FIRST occurrence, which is wrong when a paragraph repeats, and it
        # re-scans the list on every hit.
        for idx, para in enumerate(paragraphs):
            # Track HTML comment state (comments may span paragraphs)
            if "<!--" in para:
                in_html_comment = True
            if "-->" in para:
                in_html_comment = False
                continue  # Skip the closing comment line

            # Skip if we're inside a comment
            if in_html_comment:
                continue

            # Skip headers, code blocks, and markdown tables
            if para.startswith(("#", "```", "|")):
                continue

            # Skip badges and table separator rows
            if "[![" in para[:50] or "|---" in para[:100]:
                continue

            # Found a meaningful paragraph.
            # Clean it up: remove inline markdown links and normalize whitespace.
            clean_para = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", para)
            clean_para = " ".join(clean_para.split())

            # "Intro text:" followed by a bullet list becomes
            # "Intro text: a, b, c etc".
            if clean_para.endswith(":") and idx + 1 < len(paragraphs):
                bullet_items = _extract_bullet_items(paragraphs[idx + 1])
                if bullet_items:
                    # At most five items; " etc" is always appended
                    # (the original appended it in both branches).
                    items_str = ", ".join(bullet_items[:5]) + " etc"
                    description = clean_para.rstrip(":") + ": " + items_str
                    return _truncate_at_word(description)

            # Otherwise take the first sentence, ensuring a trailing period.
            description = clean_para.split(". ")[0]
            if not description.endswith("."):
                description += "."
            return _truncate_at_word(description)

        # No meaningful paragraph found.
        return ""

    except Exception:
        # Silently fail - it's okay if we can't get a description
        return ""
351+
352+
216353
async def fetch_submodules(
217354
tf_client: TerraformClient,
355+
gh_client: GitHubClient,
218356
namespace: str,
219357
name: str,
220358
provider: str,
221359
source: str
222360
) -> List[Dict[str, str]]:
223361
"""
224-
Fetch submodules for a Terraform module.
362+
Fetch submodules for a Terraform module with descriptions.
225363
226364
Args:
227365
tf_client: TerraformClient instance
366+
gh_client: GitHubClient instance
228367
namespace: Module namespace
229368
name: Module name
230369
provider: Module provider
231370
source: Module source URL
232371
233372
Returns:
234-
List of submodule dictionaries with path, name, and source_url
373+
List of submodule dictionaries with path, name, source_url, and description
235374
"""
236375
submodules = []
237376
try:
238377
module_details = await tf_client.get_module_details(
239378
namespace, name, provider, "latest"
240379
)
241380

381+
# Parse owner/repo from source URL for README fetching
382+
owner_repo = gh_client.parse_github_url(source)
383+
if not owner_repo:
384+
print(f"Warning: Could not parse GitHub URL: {source}")
385+
owner, repo = None, None
386+
else:
387+
owner, repo = owner_repo
388+
242389
# Extract submodules
243390
raw_submodules = module_details.get("submodules", [])
244-
for submodule in raw_submodules:
391+
392+
# Fetch all submodule descriptions in parallel
393+
async def fetch_single_submodule(submodule):
245394
submodule_path = submodule.get("path", "")
246395
submodule_name = submodule_path.split("/")[-1] if submodule_path else ""
247396

@@ -251,12 +400,29 @@ async def fetch_submodules(
251400
if submodule_path
252401
else source
253402
)
254-
255-
submodules.append({
403+
404+
# Fetch description from submodule's README
405+
description = ""
406+
if owner and repo and submodule_path:
407+
description = await fetch_submodule_description(
408+
gh_client, owner, repo, submodule_path
409+
)
410+
411+
return {
256412
"path": submodule_path,
257413
"name": submodule_name,
414+
"description": description,
258415
"source_url": submodule_source_url,
259-
})
416+
}
417+
418+
# Fetch all submodules in parallel
419+
submodules = await asyncio.gather(
420+
*[fetch_single_submodule(sub) for sub in raw_submodules],
421+
return_exceptions=True
422+
)
423+
424+
# Filter out any exceptions
425+
submodules = [sub for sub in submodules if isinstance(sub, dict)]
260426

261427
# Sort submodules by name
262428
submodules.sort(key=lambda x: x["name"])
@@ -503,8 +669,21 @@ async def extract_readme_excerpt(gh_client: GitHubClient, source: str) -> str:
503669

504670
# Second pass: If nothing found, get first meaningful paragraph
505671
if not readme_excerpt:
672+
in_html_comment = False
506673
for para in paragraphs:
507-
if should_skip_paragraph(para) or "<!--" in para:
674+
# Track HTML comment state
675+
if "<!--" in para:
676+
in_html_comment = True
677+
if "-->" in para:
678+
in_html_comment = False
679+
continue # Skip the closing comment line
680+
681+
# Skip if we're inside a comment
682+
if in_html_comment:
683+
continue
684+
685+
# Skip headers, code blocks, badges, and HTML comments
686+
if should_skip_paragraph(para) or para.startswith("#"):
508687
continue
509688

510689
# Found first real paragraph - use it
@@ -572,7 +751,7 @@ async def process_module(
572751

573752
# Fetch submodules for this module
574753
print(f"Fetching submodules for {module_id}...")
575-
submodules = await fetch_submodules(tf_client, namespace, name, provider, source)
754+
submodules = await fetch_submodules(tf_client, gh_client, namespace, name, provider, source)
576755

577756
# Fetch README excerpt
578757
readme_excerpt = await extract_readme_excerpt(gh_client, source)
@@ -638,12 +817,28 @@ async def generate_module_index():
638817
# Calculate cutoff date (3 months ago)
639818
cutoff_date = datetime.now(UTC) - timedelta(days=MODULE_AGE_THRESHOLD_DAYS)
640819

641-
# Process modules
820+
# Process modules in parallel (with concurrency limit to avoid overwhelming the API)
821+
print("Processing modules in parallel...")
822+
823+
# Process in batches to avoid overwhelming the API
824+
batch_size = 10
642825
filtered_modules = []
643-
for module in all_modules:
644-
processed_module = await process_module(module, tf_client, gh_client, cutoff_date)
645-
if processed_module:
646-
filtered_modules.append(processed_module)
826+
827+
for i in range(0, len(all_modules), batch_size):
828+
batch = all_modules[i:i + batch_size]
829+
batch_results = await asyncio.gather(
830+
*[process_module(module, tf_client, gh_client, cutoff_date) for module in batch],
831+
return_exceptions=True
832+
)
833+
834+
# Filter out None results and exceptions
835+
filtered_modules.extend([
836+
result for result in batch_results
837+
if result is not None and not isinstance(result, Exception)
838+
])
839+
840+
# Print progress
841+
print(f"Processed {min(i + batch_size, len(all_modules))}/{len(all_modules)} modules...")
647842

648843
# Sort by downloads (descending)
649844
filtered_modules.sort(key=lambda x: x["downloads"], reverse=True)

0 commit comments

Comments
 (0)