3434
3535import urllib
3636
37+
3738## Markdown to plain text conversion: begin ##
3839# code snippet from https://stackoverflow.com/a/54923798
3940def unmark_element (element , stream = None ):
@@ -226,7 +227,7 @@ def get_path(obj, path):
226227 license_info = {}
227228 if 'license' in filtered_resp :
228229 for k in ('name' , 'url' ):
229- if k in filtered_resp ['license' ]:
230+ if k in filtered_resp ['license' ]:
230231 license_info [k ] = filtered_resp ['license' ][k ]
231232
232233 ## If we didn't find it, look for the license
@@ -240,7 +241,8 @@ def get_path(obj, path):
240241 # license_text = license_text_resp.text
241242 license_info ['url' ] = possible_license_url
242243
243- filtered_resp ['license' ] = license_info
244+ if license_info != '' :
245+ filtered_resp ['license' ] = license_info
244246
245247 # get keywords / topics
246248 topics_headers = header
@@ -310,7 +312,7 @@ def get_path(obj, path):
310312 zip_ref .extractall (repo_extract_dir )
311313
312314 repo_folders = os .listdir (repo_extract_dir )
313- assert (len (repo_folders ) == 1 )
315+ assert (len (repo_folders ) == 1 )
314316
315317 repo_dir = os .path .join (repo_extract_dir , repo_folders [0 ])
316318
@@ -334,7 +336,8 @@ def get_path(obj, path):
334336 else :
335337 docs_path = repo_relative_path + "/" + dirname
336338
337- docs .append (f"https://github.com/{ owner } /{ repo_name } /tree/{ urllib .parse .quote (repo_ref )} /{ docs_path } " )
339+ docs .append (
340+ f"https://github.com/{ owner } /{ repo_name } /tree/{ urllib .parse .quote (repo_ref )} /{ docs_path } " )
338341 print (docs )
339342
340343 print ("NOTEBOOKS:" )
@@ -343,14 +346,12 @@ def get_path(obj, path):
343346 print ("DOCKERFILES:" )
344347 print (dockerfiles )
345348
346- def convert_to_raw_usercontent (partial ):
347-
348- return f"https://raw.githubusercontent.com/{ owner } /{ repo_name } /{ repo_ref } /{ urllib .parse .quote (partial )} "
349-
350-
351- filtered_resp ["hasExecutableNotebook" ] = [convert_to_raw_usercontent (x ) for x in notebooks ]
352- filtered_resp ["hasBuildFile" ] = [convert_to_raw_usercontent (x ) for x in dockerfiles ]
353- filtered_resp ["hasDocumentation" ] = docs
349+ if len (notebooks ) > 0 :
350+ filtered_resp ["hasExecutableNotebook" ] = [convert_to_raw_usercontent (x , owner , repo_name , repo_ref ) for x in notebooks ]
351+ if len (dockerfiles ) > 0 :
352+ filtered_resp ["hasBuildFile" ] = [convert_to_raw_usercontent (x , owner , repo_name , repo_ref ) for x in dockerfiles ]
353+ if len (docs ) > 0 :
354+ filtered_resp ["hasDocumentation" ] = docs
354355
355356 ## get releases
356357 releases_list , date = rate_limit_get (repo_api_base_url + "/releases" ,
@@ -365,6 +366,9 @@ def convert_to_raw_usercontent(partial):
365366 return text , filtered_resp
366367
367368
def convert_to_raw_usercontent(partial, owner, repo_name, repo_ref):
    """Return the raw.githubusercontent.com URL of a file inside a repository.

    Args:
        partial: Path of the file relative to the repository root.
        owner: Repository owner (user or organisation) name.
        repo_name: Repository name.
        repo_ref: Git ref (branch, tag or commit) the file is read from.

    Returns:
        The URL string, with both the ref and the file path percent-encoded.
    """
    # Quote the ref as well as the path, as the documentation URLs in this
    # file do: a ref such as "fix#12" would otherwise truncate the URL at the
    # "#". quote() keeps "/" by default, so "feature/x" refs are unchanged.
    quoted_ref = urllib.parse.quote(repo_ref)
    quoted_path = urllib.parse.quote(partial)
    return f"https://raw.githubusercontent.com/{owner}/{repo_name}/{quoted_ref}/{quoted_path}"
371+
368372## Function takes readme text as input and divides it into excerpts
369373## Returns the extracted excerpts
370374def create_excerpts (string_list ):
@@ -558,7 +562,7 @@ def merge(header_predictions, predictions, citations, dois, binder_links, long_t
558562 print ("Merge prediction using header information, classifier and bibtex and doi parsers" )
559563 if long_title :
560564 predictions ['long_title' ] = {'excerpt' : long_title , 'confidence' : [1.0 ],
561- 'technique' : 'Regular expression' }
565+ 'technique' : 'Regular expression' }
562566 for i in range (len (citations )):
563567 if 'citation' not in predictions .keys ():
564568 predictions ['citation' ] = []
@@ -575,7 +579,7 @@ def merge(header_predictions, predictions, citations, dois, binder_links, long_t
575579 for notebook in binder_links :
576580 # The identifier is in position 1. Position 0 is the badge id, which we don't want to export
577581 predictions ['executable_example' ].insert (0 , {'excerpt' : notebook [1 ], 'confidence' : [1.0 ],
578- 'technique' : 'Regular expression' })
582+ 'technique' : 'Regular expression' })
579583 for headers in header_predictions :
580584 if headers not in predictions .keys ():
581585 predictions [headers ] = header_predictions [headers ]
@@ -596,7 +600,10 @@ def format_output(git_data, repo_data):
596600 repo_data ['description' ] = []
597601 repo_data ['description' ].append ({'excerpt' : git_data [i ], 'confidence' : [1.0 ], 'technique' : 'GitHub API' })
598602 else :
599- repo_data [i ] = {'excerpt' : git_data [i ], 'confidence' : [1.0 ], 'technique' : 'GitHub API' }
603+ if i == 'hasExecutableNotebook' or i == 'hasBuildFile' or i == 'hasDocumentation' :
604+ repo_data [i ] = {'excerpt' : git_data [i ], 'confidence' : [1.0 ], 'technique' : 'File Exploration' }
605+ else :
606+ repo_data [i ] = {'excerpt' : git_data [i ], 'confidence' : [1.0 ], 'technique' : 'GitHub API' }
600607
601608 return repo_data
602609
@@ -618,16 +625,15 @@ def save_json(git_data, repo_data, outfile):
618625 repo_data = format_output (git_data , repo_data )
619626 save_json_output (repo_data , outfile )
620627
621- def save_codemeta_output (repo_data , outfile , pretty = False ):
622628
629+ def save_codemeta_output (repo_data , outfile , pretty = False ):
623630 def data_path (path ):
624631 return DataGraph .resolve_path (repo_data , path )
625632
626633 def format_date (date_string ):
627634 date_object = date_parser .parse (date_string )
628635 return date_object .strftime ("%Y-%m-%d" )
629636
630-
631637 latest_release = None
632638 releases = data_path (["releases" , "excerpt" ])
633639
@@ -659,9 +665,9 @@ def average_confidence(x):
659665 else :
660666 return 0
661667
662-
663668 descriptions = data_path (["description" ])
664- descriptions .sort (key = lambda x : (average_confidence (x ) + (1 if x ["technique" ] == "GitHub API" else 0 )), reverse = True )
669+ descriptions .sort (key = lambda x : (average_confidence (x ) + (1 if x ["technique" ] == "GitHub API" else 0 )),
670+ reverse = True )
665671 descriptions_text = [x ["excerpt" ] for x in descriptions ]
666672
667673 codemeta_output = {
@@ -792,4 +798,4 @@ def run_cli(*,
792798 out_file .write (data_graph .g .serialize (format = graph_format ))
793799
794800 if codemeta_out is not None :
795- save_codemeta_output (repo_data , codemeta_out , pretty = pretty )
801+ save_codemeta_output (repo_data , codemeta_out , pretty = pretty )
0 commit comments