@@ -213,35 +213,184 @@ def parse_iso_date(date_str: str) -> Optional[datetime]:
213213        return  None 
214214
215215
async def fetch_submodule_description(
    gh_client: "GitHubClient",
    owner: str,
    repo: str,
    submodule_path: str
) -> str:
    """
    Extract a brief description from a submodule's README.

    Fetches ``<submodule_path>/README.md`` through the GitHub client, skips
    headers, badges, fenced code blocks, markdown tables, and HTML comments,
    and returns either the first sentence of the first meaningful paragraph
    or — when that paragraph introduces a bullet list with a trailing
    colon — the intro text plus a comma-separated summary of the bullets.

    Args:
        gh_client: GitHubClient instance used to fetch file contents.
        owner: Repository owner.
        repo: Repository name.
        submodule_path: Path to the submodule directory within the repo.

    Returns:
        Brief description string, or "" when no README or usable paragraph
        is found. Any fetch/parse error also yields "" — a missing
        description is intentionally non-fatal for index generation.
    """
    try:
        readme_path = f"{submodule_path}/README.md"
        readme_data = await gh_client.get_file_content(owner, repo, readme_path)
        content = readme_data.get("decoded_content", "")
        if not content:
            return ""

        # Paragraphs are the blank-line-separated chunks of the README.
        paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]

        in_html_comment = False
        for idx, para in enumerate(paragraphs):
            # Track HTML comment state across paragraphs (comments may span
            # several blank-line-separated chunks).
            if "<!--" in para:
                in_html_comment = True
            if "-->" in para:
                in_html_comment = False
                continue  # Skip the paragraph that closes the comment.
            if in_html_comment:
                continue

            if not _is_descriptive_paragraph(para):
                continue

            # Strip markdown links ([text](url) -> text), collapse whitespace.
            clean_para = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", para)
            clean_para = " ".join(clean_para.split())

            # An intro ending in ":" (e.g. "provides the following:") is
            # usually followed by a bullet list; summarize those bullets.
            # Using the loop index (not list.index) so duplicate paragraphs
            # can't make us pick the wrong "next" paragraph.
            if clean_para.endswith(":") and idx + 1 < len(paragraphs):
                summary = _summarize_bullets(clean_para, paragraphs[idx + 1])
                if summary:
                    return _truncate_at_word(summary)

            # Otherwise use the first sentence of this paragraph.
            description = clean_para.split(". ")[0]
            if not description.endswith("."):
                description += "."
            return _truncate_at_word(description)

        return ""

    except Exception:
        # Best-effort: failing to get a description must never break the run.
        return ""


def _is_descriptive_paragraph(para: str) -> bool:
    """Return True if *para* looks like prose (not header/code/badge/table)."""
    if para.startswith("#"):        # markdown header
        return False
    if para.startswith("```"):      # fenced code block
        return False
    if "[![" in para[:50]:          # badge row
        return False
    if para.startswith("|") or "|---" in para[:100]:  # markdown table
        return False
    return True


def _summarize_bullets(intro: str, bullet_para: str) -> str:
    """
    Build ``"<intro>: item1, item2 etc"`` from a bullet-list paragraph.

    Args:
        intro: Whitespace-normalized intro paragraph ending in ":".
        bullet_para: The raw paragraph expected to contain "-"/"*" bullets.

    Returns:
        The combined summary, or "" when no usable bullet items are found.
    """
    items = []
    for line in bullet_para.split("\n"):
        line = line.strip()
        if not (line.startswith("-") or line.startswith("*")):
            continue
        item = line.lstrip("-*").strip()
        # Drop markdown links, keeping the link text.
        item = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", item)
        # Keep only the label before any ":" explanation.
        if ":" in item:
            item = item.split(":")[0].strip()
        item = " ".join(item.split())
        # Ignore empty or suspiciously long items (likely prose, not labels).
        if item and len(item) < 80:
            items.append(item)

    if not items:
        return ""

    # Cap at five items; " etc" is always appended to signal an abbreviated
    # list (this matches the previous behavior, whose two branches were
    # identical).
    items_str = ", ".join(items[:5]) + " etc"
    intro_clean = " ".join(intro.rstrip(":").split())
    return intro_clean + ": " + items_str


def _truncate_at_word(text: str, limit: int = 1200) -> str:
    """Truncate *text* to *limit* chars at a word boundary, appending '...'."""
    if len(text) <= limit:
        return text
    truncated = text[:limit - 3]
    last_space = truncated.rfind(" ")
    if last_space > 0:
        return text[:last_space] + "..."
    return truncated + "..."
351+ 
352+ 
216353async  def  fetch_submodules (
217354    tf_client : TerraformClient ,
355+     gh_client : GitHubClient ,
218356    namespace : str ,
219357    name : str ,
220358    provider : str ,
221359    source : str 
222360) ->  List [Dict [str , str ]]:
223361    """ 
224-     Fetch submodules for a Terraform module. 
362+     Fetch submodules for a Terraform module with descriptions . 
225363     
226364    Args: 
227365        tf_client: TerraformClient instance 
366+         gh_client: GitHubClient instance 
228367        namespace: Module namespace 
229368        name: Module name 
230369        provider: Module provider 
231370        source: Module source URL 
232371         
233372    Returns: 
234-         List of submodule dictionaries with path, name, and source_url  
373+         List of submodule dictionaries with path, name, source_url,  and description  
235374    """ 
236375    submodules  =  []
237376    try :
238377        module_details  =  await  tf_client .get_module_details (
239378            namespace , name , provider , "latest" 
240379        )
241380
381+         # Parse owner/repo from source URL for README fetching 
382+         owner_repo  =  gh_client .parse_github_url (source )
383+         if  not  owner_repo :
384+             print (f"Warning: Could not parse GitHub URL: { source }  " )
385+             owner , repo  =  None , None 
386+         else :
387+             owner , repo  =  owner_repo 
388+ 
242389        # Extract submodules 
243390        raw_submodules  =  module_details .get ("submodules" , [])
244-         for  submodule  in  raw_submodules :
391+         
392+         # Fetch all submodule descriptions in parallel 
393+         async  def  fetch_single_submodule (submodule ):
245394            submodule_path  =  submodule .get ("path" , "" )
246395            submodule_name  =  submodule_path .split ("/" )[- 1 ] if  submodule_path  else  "" 
247396
@@ -251,12 +400,29 @@ async def fetch_submodules(
251400                if  submodule_path 
252401                else  source 
253402            )
254- 
255-             submodules .append ({
403+             
404+             # Fetch description from submodule's README 
405+             description  =  "" 
406+             if  owner  and  repo  and  submodule_path :
407+                 description  =  await  fetch_submodule_description (
408+                     gh_client , owner , repo , submodule_path 
409+                 )
410+ 
411+             return  {
256412                "path" : submodule_path ,
257413                "name" : submodule_name ,
414+                 "description" : description ,
258415                "source_url" : submodule_source_url ,
259-             })
416+             }
417+         
418+         # Fetch all submodules in parallel 
419+         submodules  =  await  asyncio .gather (
420+             * [fetch_single_submodule (sub ) for  sub  in  raw_submodules ],
421+             return_exceptions = True 
422+         )
423+         
424+         # Filter out any exceptions 
425+         submodules  =  [sub  for  sub  in  submodules  if  isinstance (sub , dict )]
260426
261427        # Sort submodules by name 
262428        submodules .sort (key = lambda  x : x ["name" ])
@@ -503,8 +669,21 @@ async def extract_readme_excerpt(gh_client: GitHubClient, source: str) -> str:
503669
504670        # Second pass: If nothing found, get first meaningful paragraph 
505671        if  not  readme_excerpt :
672+             in_html_comment  =  False 
506673            for  para  in  paragraphs :
507-                 if  should_skip_paragraph (para ) or  "<!--"  in  para :
674+                 # Track HTML comment state 
675+                 if  "<!--"  in  para :
676+                     in_html_comment  =  True 
677+                 if  "-->"  in  para :
678+                     in_html_comment  =  False 
679+                     continue   # Skip the closing comment line 
680+                 
681+                 # Skip if we're inside a comment 
682+                 if  in_html_comment :
683+                     continue 
684+                 
685+                 # Skip headers, code blocks, badges, and HTML comments 
686+                 if  should_skip_paragraph (para ) or  para .startswith ("#" ):
508687                    continue 
509688
510689                # Found first real paragraph - use it 
@@ -572,7 +751,7 @@ async def process_module(
572751
573752    # Fetch submodules for this module 
574753    print (f"Fetching submodules for { module_id }  ..." )
575-     submodules  =  await  fetch_submodules (tf_client , namespace , name , provider , source )
754+     submodules  =  await  fetch_submodules (tf_client , gh_client ,  namespace , name , provider , source )
576755
577756    # Fetch README excerpt 
578757    readme_excerpt  =  await  extract_readme_excerpt (gh_client , source )
@@ -638,12 +817,28 @@ async def generate_module_index():
638817        # Calculate cutoff date (3 months ago) 
639818        cutoff_date  =  datetime .now (UTC ) -  timedelta (days = MODULE_AGE_THRESHOLD_DAYS )
640819
641-         # Process modules 
820+         # Process modules in parallel (with concurrency limit to avoid overwhelming the API) 
821+         print ("Processing modules in parallel..." )
822+         
823+         # Process in batches to avoid overwhelming the API 
824+         batch_size  =  10 
642825        filtered_modules  =  []
643-         for  module  in  all_modules :
644-             processed_module  =  await  process_module (module , tf_client , gh_client , cutoff_date )
645-             if  processed_module :
646-                 filtered_modules .append (processed_module )
826+         
827+         for  i  in  range (0 , len (all_modules ), batch_size ):
828+             batch  =  all_modules [i :i  +  batch_size ]
829+             batch_results  =  await  asyncio .gather (
830+                 * [process_module (module , tf_client , gh_client , cutoff_date ) for  module  in  batch ],
831+                 return_exceptions = True 
832+             )
833+             
834+             # Filter out None results and exceptions 
835+             filtered_modules .extend ([
836+                 result  for  result  in  batch_results  
837+                 if  result  is  not   None  and  not  isinstance (result , Exception )
838+             ])
839+             
840+             # Print progress 
841+             print (f"Processed { min (i  +  batch_size , len (all_modules ))}  /{ len (all_modules )}   modules..." )
647842
648843        # Sort by downloads (descending) 
649844        filtered_modules .sort (key = lambda  x : x ["downloads" ], reverse = True )
0 commit comments