Skip to content

Commit 8b3fc70

Browse files
MonicaG and Copilot
authored
Handle low usage tags (#28)
* Added support for tags under min threshold and cleaned up type issues
* Update utils/stackoverflow/populate_discussion.py
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Update utils/stackoverflow/so_explore.py
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
--------
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 3b963eb commit 8b3fc70

File tree

5 files changed

+121
-33
lines changed

5 files changed

+121
-33
lines changed

utils/stackoverflow/populate_discussion.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def get_labels(repo):
6464
"""Get all labels in the repository"""
6565
return {label.name: label for label in repo.get_labels()}
6666

67-
def create_label(repo, name: str, description: str = None):
67+
def create_label(repo, name: str, description: Optional[str] = None):
6868
"""Create a new label in the repository, ensuring description is < 100 chars"""
6969
if description and len(description) > 100:
7070
description = description[:97] + '...'
@@ -472,7 +472,10 @@ def get_tags_under_threshold(min_threshold: int, tags_data: List[Dict[str, Any]]
472472
Returns:
473473
List of tag names (strings) for tags with count < min_threshold
474474
"""
475-
return [tag['name'] for tag in tags_data if tag.get('count', 0) < min_threshold]
475+
logger.info(f"Identifying tags with count below threshold of {min_threshold}")
476+
low_count_tags = [tag['name'] for tag in tags_data if tag.get('count', 0) < min_threshold]
477+
logger.info(f"Found {len(low_count_tags)} tags below threshold: {low_count_tags}")
478+
return low_count_tags
476479

477480
def get_tags_at_or_above_threshold(min_threshold: int, tags_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
478481
"""Get tag objects for tags with count at or above the minimum threshold.
@@ -682,14 +685,19 @@ def main():
682685
github_auth_manager = GitHubAuthManager()
683686
github_auth_manager.initialize()
684687

688+
github_client = github_auth_manager.get_client()
689+
if github_client is None:
690+
logger.error("GitHub client is not initialized. Please check your authentication setup.")
691+
raise RuntimeError("GitHub client is not initialized.")
692+
685693
github_graphql = GraphQLHelper(github_auth_manager, rate_limiter)
686694

687695
repo_parts = args.repo.split('/')
688696
if len(repo_parts) != 2:
689697
raise ValueError("Repository must be in format 'owner/name'")
690698

691699
owner, name = repo_parts
692-
repo = github_auth_manager.get_client().get_repo(f"{owner}/{name}")
700+
repo = github_client.get_repo(f"{owner}/{name}")
693701

694702
logger.info(f"Using repo '{repo.full_name}'")
695703

@@ -879,7 +887,7 @@ def main():
879887
class TagsToIgnore:
880888
"""Helper class to manage tags that should be ignored during migration."""
881889

882-
def __init__(self, tags_to_ignore: list[str] = None):
890+
def __init__(self, tags_to_ignore: Optional[List[str]] = None):
883891
"""Initialize tags to ignore.
884892
885893
Args:
@@ -900,7 +908,7 @@ def should_ignore(self, tags: list[str]) -> bool:
900908
True if any tag should be ignored, False otherwise
901909
"""
902910

903-
return self.tags_to_ignore and any(t in self.tags_to_ignore for t in tags)
911+
return bool(self.tags_to_ignore) and any(t in self.tags_to_ignore for t in tags)
904912

905913
if __name__ == '__main__':
906914
main()

utils/stackoverflow/populate_discussion_helpers.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import time
88
from github import Github, Auth
99
import requests
10+
from typing import Optional
1011

1112
# Setup logging
1213
logger = logging.getLogger(__name__)
@@ -87,7 +88,9 @@ def get_token(self):
8788
if not self._initialized:
8889
raise Exception("GitHub auth not initialized. Call initialize() first.")
8990

90-
return self._github_auth.token
91+
if self._github_auth is None:
92+
raise Exception("GitHub authentication has not been initialized or failed.")
93+
return self._github_auth.token
9194

9295
def get_client(self):
9396
"""Get the current GitHub client."""
@@ -104,8 +107,7 @@ def is_initialized(self):
104107

105108
class GraphQLHelper:
106109
"""Helper class for GraphQL operations."""
107-
108-
def __init__(self, github_auth_manager: GitHubAuthManager, rate_limiter: RateLimiter = None):
110+
def __init__(self, github_auth_manager: GitHubAuthManager, rate_limiter: Optional[RateLimiter] = None):
109111
"""Initialize with GitHub auth manager and optional rate limiter.
110112
111113
Args:

utils/stackoverflow/so_explore.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
# dd.sql("select question_id, view_count from './questions_answers_comments.json' as questions where view_count >= 100 order by view_count desc").show()
2424
# dd.sql("select questions.answers.comments.link, questions.answers.link, questions.answers.share_link, questions.link, questions.share_link from './questions_answers_comments.json' as questions").show()
2525
# dd.sql("select unnest(questions.comments).link, unnest(questions.answers).share_link, unnest(questions.answers).link, unnest(questions.answers.comments).link, questions.link, questions.share_link from './questions_answers_comments.json' as questions").show()
26+
dd.sql("select tags.name, tags.count from './tags.json' as tags where tags.count < 5").show()
2627

2728
# This gets all links and share_links from answers, questions and comments and outputs them to a file. User links are ignored
2829
#dd.sql("Copy (" \

utils/stackoverflow/test_validate_migration.py

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,47 @@ def test_validate_question_content_missing_tags(self):
173173
self.assertIn("networking", issues[0])
174174
self.assertIn("troubleshooting", issues[0])
175175

176+
def test_validate_question_content_has_low_usage_tag(self):
177+
"""Test validation when a low usage tag is present."""
178+
so_question = {
179+
"title": "VPN Help",
180+
"body": "Need VPN help.",
181+
"tags": ["vpn", "networking", "troubleshooting"]
182+
}
183+
184+
gh_discussion = {
185+
"title": "VPN Help",
186+
"body": "Need VPN help.",
187+
"labels": {"nodes": [{"name": "vpn"}, {"name": "troubleshooting"}]}
188+
}
189+
190+
mock_low_usage_tags_data = ["networking", "help"]
191+
192+
issues = self.validator.validate_question_content(so_question, gh_discussion, mock_low_usage_tags_data)
193+
self.assertEqual(len(issues), 0)
194+
195+
def test_validate_question_content_missing_mixed_missing_and_low_usage_tag(self):
196+
"""Test validation when one absent tag is low usage and another is genuinely missing."""
197+
so_question = {
198+
"title": "VPN Help",
199+
"body": "Need VPN help.",
200+
"tags": ["vpn", "networking", "troubleshooting"]
201+
}
202+
203+
gh_discussion = {
204+
"title": "VPN Help",
205+
"body": "Need VPN help.",
206+
"labels": {"nodes": [{"name": "vpn"}]}
207+
}
208+
209+
mock_low_usage_tags_data = ["networking", "help"]
210+
211+
issues = self.validator.validate_question_content(so_question, gh_discussion, mock_low_usage_tags_data)
212+
self.assertEqual(len(issues), 1)
213+
self.assertIn("Missing tags:", issues[0])
214+
# Check that both missing tags are mentioned, regardless of order
215+
self.assertIn("troubleshooting", issues[0])
216+
176217
def test_validate_question_content_with_images_matching(self):
177218
"""Test validation when images are present and match between SO and GH."""
178219
so_question = {
@@ -1069,12 +1110,44 @@ def test_validate_migration_with_ignored_tags_integration(self, mock_load_json):
10691110
"tags": ["current", "deprecated"]
10701111
}
10711112
]
1072-
mock_load_json.return_value = mock_so_questions
1113+
1114+
# Mock the tags data (second file parameter)
1115+
mock_tags_data = [
1116+
{
1117+
"count": 145,
1118+
"name": "openshift"
1119+
},
1120+
{
1121+
"count": 32,
1122+
"name": "keycloak"
1123+
},
1124+
{
1125+
"count": 26,
1126+
"name": "security"
1127+
},
1128+
{
1129+
"count": 15,
1130+
"name": "deprecated"
1131+
},
1132+
{
1133+
"count": 10,
1134+
"name": "legacy"
1135+
}
1136+
]
1137+
1138+
# Configure mock to return different data based on which file is being loaded
1139+
def mock_load_json_side_effect(file_path):
1140+
if "mock_tag_file.json" in file_path:
1141+
return mock_tags_data
1142+
else:
1143+
return mock_so_questions
1144+
1145+
mock_load_json.side_effect = mock_load_json_side_effect
10731146

10741147
# Mock get_github_discussions to return empty (none migrated)
10751148
validator.get_github_discussions = Mock(return_value=[])
10761149

1077-
results = validator.validate_migration("dummy_file.json")
1150+
results = validator.validate_migration("mock_so_file.json", "mock_tag_file.json")
10781151

10791152
# Check results
10801153
self.assertEqual(results['total_questions'], 4)

utils/stackoverflow/validate_migration.py

Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from typing import Dict, List, Optional
77
from populate_discussion_helpers import GitHubAuthManager, GraphQLHelper
88
from populate_discussion import (
9-
load_json, decode_html_entities, get_category_id, POPULAR_TAG_NAME
9+
load_json, decode_html_entities, get_category_id, POPULAR_TAG_NAME, get_tags_under_threshold
1010
)
1111
import duckdb as dd
1212

@@ -64,12 +64,13 @@ def replace_html_image(match):
6464
return text
6565

6666
class MigrationValidator:
67-
def __init__(self, auth_manager: GitHubAuthManager, owner: str, name: str, category_name: str, ignored_tags: Optional[List[str]] = None, popular_tag_min_threshold: int = 200):
67+
def __init__(self, auth_manager: GitHubAuthManager, owner: str, name: str, category_name: str, ignored_tags: Optional[List[str]] = None, popular_tag_min_threshold: int = 200, tag_min_threshold: int = 1):
6868
self.owner = owner
6969
self.name = name
7070
self.category_name = category_name
7171
self.ignored_tags = ignored_tags or []
7272
self.popular_tag_min_threshold = popular_tag_min_threshold
73+
self.tag_min_threshold = tag_min_threshold
7374
self.github_graphql = GraphQLHelper(auth_manager)
7475
self.popular_gh_questions = set()
7576
self.validation_results = {
@@ -157,7 +158,7 @@ def get_github_discussions(self) -> List[Dict]:
157158

158159
return discussions
159160

160-
def validate_question_content(self, so_question: Dict, gh_discussion: Dict) -> List[str]:
161+
def validate_question_content(self, so_question: Dict, gh_discussion: Dict, tags_under_min_threshold: Optional[List[str]] = None) -> List[str]:
161162
"""Validate that question content was transferred correctly."""
162163
issues = []
163164

@@ -203,6 +204,8 @@ def validate_question_content(self, so_question: Dict, gh_discussion: Dict) -> L
203204

204205
# Check tags/labels
205206
so_tags = set(so_question.get('tags', []))
207+
if tags_under_min_threshold:
208+
so_tags = so_tags - set(tags_under_min_threshold)
206209
gh_labels = set(label['name'] for label in gh_discussion['labels']['nodes'])
207210
missing_tags = so_tags - gh_labels
208211
if missing_tags:
@@ -335,7 +338,7 @@ def validate_comments(self, so_question: Dict, gh_discussion: Dict) -> List[str]
335338

336339
return issues
337340

338-
def process_question(self, so_question: Dict, gh_discussions_by_title: Dict) -> None:
341+
def process_question(self, so_question: Dict, gh_discussions_by_title: Dict, tags_under_min_threshold: Optional[List[str]] = None) -> None:
339342
"""
340343
Process a single SO question and update validation results.
341344
@@ -350,7 +353,7 @@ def process_question(self, so_question: Dict, gh_discussions_by_title: Dict) ->
350353
gh_discussion = gh_discussions_by_title[so_title]
351354

352355
# Validate content
353-
content_issues = self.validate_question_content(so_question, gh_discussion)
356+
content_issues = self.validate_question_content(so_question, gh_discussion, tags_under_min_threshold)
354357
if content_issues:
355358
self.validation_results['content_issues'].append({
356359
'id': so_question['question_id'],
@@ -400,49 +403,45 @@ def validate_popular_tags(self, questions_file: str) -> None:
400403
if tagged_as_popular_but_are_not:
401404
self.validation_results['popular_question_issues']['tagged_as_popular_but_are_not'] = tagged_as_popular_but_are_not
402405

403-
def validate_migration(self, questions_file: str) -> Dict:
406+
def validate_migration(self, questions_file: str, tags_file: str) -> Dict:
404407
"""Main validation method."""
405408
logger.info("Starting migration validation...")
406409

407410
# Load Stack Overflow data
408411
so_questions = load_json(questions_file)
412+
tags = load_json(tags_file)
409413
self.validation_results['total_questions'] = len(so_questions)
410414

411415
# Get GitHub discussions
412416
gh_discussions = self.get_github_discussions()
413417
gh_discussions_by_title = {d['title']: d for d in gh_discussions}
414-
418+
419+
# Get tags that were omitted because of low usage
420+
tags_under_min_threshold = get_tags_under_threshold(self.tag_min_threshold, tags)
421+
415422
logger.info(f"Found {len(so_questions)} SO questions and {len(gh_discussions)} GH discussions")
416-
423+
424+
logger.info("Starting question verification...")
417425
for so_question in so_questions:
418-
self.process_question(so_question, gh_discussions_by_title)
426+
self.process_question(so_question, gh_discussions_by_title, tags_under_min_threshold)
419427

420428
self.validate_popular_tags(questions_file)
421429
return self.validation_results
422430

423-
def calculate_success_rate(self) -> float:
424-
results = self.validation_results
425-
if results['total_questions'] == 0:
426-
return 0.0
427-
428-
successful_questions = results['migrated_questions'] + len(results['ignored_questions'])
429-
return (successful_questions / results['total_questions']) * 100
431+
430432

431433
def generate_report(self) -> str:
432434
"""Generate a validation report."""
433435
results = self.validation_results
434-
success_rate = self.calculate_success_rate()
435-
436436
report = f"""
437437
# Migration Validation Report
438438
439-
Success rate is determined by the formula: (migrated questions + ignored questions) / total questions * 100
440439
441440
## Summary
442441
- Total SO Questions: {results['total_questions']}
443442
- Total Migrated Questions: {results['migrated_questions']}
444443
- Ignored Questions: {len(results['ignored_questions'])}
445-
- Success Rate: {success_rate:.1f}%
444+
- Migrated + Ignored Questions: {results['migrated_questions'] + len(results['ignored_questions'])}
446445
- Missing Questions: {len(results['missing_questions'])}
447446
- Content Issues: {len(results['content_issues'])}
448447
- Answer Mismatches: {len(results['answer_mismatches'])}
@@ -515,16 +514,21 @@ def main():
515514
parser.add_argument('--category', required=True, help='Discussion category name')
516515
parser.add_argument('--questions-file', default='questions_answers_comments.json',
517516
help='Path to questions JSON file')
517+
parser.add_argument('--tags-file', default='tags.json', help='Path to tags JSON file')
518518
parser.add_argument('--output', default='validation_report.md',
519519
help='Output file for validation report')
520520
parser.add_argument('--ignore-tags',
521521
type=str,
522522
nargs='+',
523523
help='List of tags that were ignored in the migration process (space-separated). Questions that were tagged with these tag(s) were not migrated.')
524+
parser.add_argument('--tag-min-threshold',
525+
type=int,
526+
default=1,
527+
help='The value used in the migration process to determine if a tag would be migrated. (default=1)')
524528
parser.add_argument('--popular-tag-min-threshold',
525529
required=True,
526530
type=int,
527-
help='The value used in the migration process to determine popular tags.')
531+
help='The value used in the migration process to determine popular questions.')
528532

529533
args = parser.parse_args()
530534

@@ -542,10 +546,10 @@ def main():
542546
github_auth_manager.initialize()
543547

544548
# Create validator
545-
validator = MigrationValidator(github_auth_manager, owner, name, args.category, args.ignore_tags, args.popular_tag_min_threshold)
549+
validator = MigrationValidator(github_auth_manager, owner, name, args.category, args.ignore_tags, args.popular_tag_min_threshold, args.tag_min_threshold)
546550

547551
# Run validation
548-
validator.validate_migration(args.questions_file)
552+
validator.validate_migration(args.questions_file, args.tags_file)
549553

550554
# Generate and save report
551555
report = validator.generate_report()

0 commit comments

Comments
 (0)