66from typing import Dict , List , Optional
77from populate_discussion_helpers import GitHubAuthManager , GraphQLHelper
88from populate_discussion import (
9- load_json , decode_html_entities , get_category_id , POPULAR_TAG_NAME
9+ load_json , decode_html_entities , get_category_id , POPULAR_TAG_NAME , get_tags_under_threshold
1010)
1111import duckdb as dd
1212
@@ -64,12 +64,13 @@ def replace_html_image(match):
6464 return text
6565
6666class MigrationValidator :
67- def __init__ (self , auth_manager : GitHubAuthManager , owner : str , name : str , category_name : str , ignored_tags : Optional [List [str ]] = None , popular_tag_min_threshold : int = 200 ):
67+ def __init__ (self , auth_manager : GitHubAuthManager , owner : str , name : str , category_name : str , ignored_tags : Optional [List [str ]] = None , popular_tag_min_threshold : int = 200 , tag_min_threshold : int = 1 ):
6868 self .owner = owner
6969 self .name = name
7070 self .category_name = category_name
7171 self .ignored_tags = ignored_tags or []
7272 self .popular_tag_min_threshold = popular_tag_min_threshold
73+ self .tag_min_threshold = tag_min_threshold
7374 self .github_graphql = GraphQLHelper (auth_manager )
7475 self .popular_gh_questions = set ()
7576 self .validation_results = {
@@ -157,7 +158,7 @@ def get_github_discussions(self) -> List[Dict]:
157158
158159 return discussions
159160
160- def validate_question_content (self , so_question : Dict , gh_discussion : Dict ) -> List [str ]:
161+ def validate_question_content (self , so_question : Dict , gh_discussion : Dict , tags_under_min_threshold : Optional [ List [ str ]] = None ) -> List [str ]:
161162 """Validate that question content was transferred correctly."""
162163 issues = []
163164
@@ -203,6 +204,8 @@ def validate_question_content(self, so_question: Dict, gh_discussion: Dict) -> L
203204
204205 # Check tags/labels
205206 so_tags = set (so_question .get ('tags' , []))
207+ if tags_under_min_threshold :
208+ so_tags = so_tags - set (tags_under_min_threshold )
206209 gh_labels = set (label ['name' ] for label in gh_discussion ['labels' ]['nodes' ])
207210 missing_tags = so_tags - gh_labels
208211 if missing_tags :
@@ -335,7 +338,7 @@ def validate_comments(self, so_question: Dict, gh_discussion: Dict) -> List[str]
335338
336339 return issues
337340
338- def process_question (self , so_question : Dict , gh_discussions_by_title : Dict ) -> None :
341+ def process_question (self , so_question : Dict , gh_discussions_by_title : Dict , tags_under_min_threshold : Optional [ List [ str ]] = None ) -> None :
339342 """
340343 Process a single SO question and update validation results.
341344
@@ -350,7 +353,7 @@ def process_question(self, so_question: Dict, gh_discussions_by_title: Dict) ->
350353 gh_discussion = gh_discussions_by_title [so_title ]
351354
352355 # Validate content
353- content_issues = self .validate_question_content (so_question , gh_discussion )
356+ content_issues = self .validate_question_content (so_question , gh_discussion , tags_under_min_threshold )
354357 if content_issues :
355358 self .validation_results ['content_issues' ].append ({
356359 'id' : so_question ['question_id' ],
@@ -400,49 +403,45 @@ def validate_popular_tags(self, questions_file: str) -> None:
400403 if tagged_as_popular_but_are_not :
401404 self .validation_results ['popular_question_issues' ]['tagged_as_popular_but_are_not' ] = tagged_as_popular_but_are_not
402405
def validate_migration(self, questions_file: str, tags_file: Optional[str] = None) -> Dict:
    """Run the full validation pass and return the accumulated results.

    Args:
        questions_file: Path to the Stack Overflow questions JSON file.
        tags_file: Path to the tags JSON file. Optional so that callers
            written against the earlier one-argument signature keep
            working; when omitted, no low-usage tags are excluded from
            the tag/label comparison.

    Returns:
        ``self.validation_results``, which this call updates in place.
    """
    logger.info("Starting migration validation...")

    # Load Stack Overflow data up front so a bad path fails before any
    # GitHub requests are made.
    so_questions = load_json(questions_file)
    tags = load_json(tags_file) if tags_file is not None else None
    self.validation_results['total_questions'] = len(so_questions)

    # Get GitHub discussions, indexed by title for lookup per question.
    # NOTE: if two discussions share a title, the later one wins here.
    gh_discussions = self.get_github_discussions()
    gh_discussions_by_title = {d['title']: d for d in gh_discussions}

    # Get tags that were omitted because of low usage
    tags_under_min_threshold = None
    if tags is not None:
        tags_under_min_threshold = get_tags_under_threshold(self.tag_min_threshold, tags)

    logger.info(f"Found {len(so_questions)} SO questions and {len(gh_discussions)} GH discussions")

    logger.info("Starting question verification...")
    for so_question in so_questions:
        self.process_question(so_question, gh_discussions_by_title, tags_under_min_threshold)

    self.validate_popular_tags(questions_file)
    return self.validation_results
422430
423- def calculate_success_rate (self ) -> float :
424- results = self .validation_results
425- if results ['total_questions' ] == 0 :
426- return 0.0
427-
428- successful_questions = results ['migrated_questions' ] + len (results ['ignored_questions' ])
429- return (successful_questions / results ['total_questions' ]) * 100
431+
430432
431433 def generate_report (self ) -> str :
432434 """Generate a validation report."""
433435 results = self .validation_results
434- success_rate = self .calculate_success_rate ()
435-
436436 report = f"""
437437# Migration Validation Report
438438
439- Success rate is determined by the formula: (migrated questions + ignored questions) / total questions * 100
440439
441440## Summary
442441- Total SO Questions: { results ['total_questions' ]}
443442- Total Migrated Questions: { results ['migrated_questions' ]}
444443- Ignored Questions: { len (results ['ignored_questions' ])}
445- - Success Rate : { success_rate :.1f } %
444+ - Migrated + Ignored Questions : { results [ 'migrated_questions' ] + len ( results [ 'ignored_questions' ]) }
446445- Missing Questions: { len (results ['missing_questions' ])}
447446- Content Issues: { len (results ['content_issues' ])}
448447- Answer Mismatches: { len (results ['answer_mismatches' ])}
@@ -515,16 +514,21 @@ def main():
515514 parser .add_argument ('--category' , required = True , help = 'Discussion category name' )
516515 parser .add_argument ('--questions-file' , default = 'questions_answers_comments.json' ,
517516 help = 'Path to questions JSON file' )
517+ parser .add_argument ('--tags-file' , default = 'tags.json' , help = 'Path to tags JSON file' )
518518 parser .add_argument ('--output' , default = 'validation_report.md' ,
519519 help = 'Output file for validation report' )
520520 parser .add_argument ('--ignore-tags' ,
521521 type = str ,
522522 nargs = '+' ,
523523 help = 'List of tags that were ignored in the migration process (space-separated). Questions that were tagged with these tag(s) were not migrated.' )
524+ parser .add_argument ('--tag-min-threshold' ,
525+ type = int ,
526+ default = 1 ,
527+ help = 'The value used in the migration process to determine if a tag would be migrated. (default=1)' )
524528 parser .add_argument ('--popular-tag-min-threshold' ,
525529 required = True ,
526530 type = int ,
527- help = 'The value used in the migration process to determine popular tags .' )
531+ help = 'The value used in the migration process to determine popular questions .' )
528532
529533 args = parser .parse_args ()
530534
@@ -542,10 +546,10 @@ def main():
542546 github_auth_manager .initialize ()
543547
544548 # Create validator
545- validator = MigrationValidator (github_auth_manager , owner , name , args .category , args .ignore_tags , args .popular_tag_min_threshold )
549+ validator = MigrationValidator (github_auth_manager , owner , name , args .category , args .ignore_tags , args .popular_tag_min_threshold , args . tag_min_threshold )
546550
547551 # Run validation
548- validator .validate_migration (args .questions_file )
552+ validator .validate_migration (args .questions_file , args . tags_file )
549553
550554 # Generate and save report
551555 report = validator .generate_report ()
0 commit comments