Skip to content

Commit b352fe6

Browse files
authored
Implement Image and Infobox counting (#171)
This feature counts the images and infoboxes in the submitted article and saves both counts to the database.
2 parents d09fa8f + 4567e59 commit b352fe6

File tree

4 files changed

+149
-0
lines changed

4 files changed

+149
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""Add image_count and infobox_count columns to submissions table
2+
3+
Revision ID: f1a2b3c4d5e6
4+
Revises: e4e56960f418
5+
6+
"""
7+
from alembic import op
8+
import sqlalchemy as sa
9+
from sqlalchemy import inspect
10+
11+
12+
# Alembic revision identifiers: this migration's own ID and the parent
# revision it revises (see the "Revision ID" / "Revises" lines in the
# module docstring).
revision = "f1a2b3c4d5e6"
13+
down_revision = "e4e56960f418"
14+
branch_labels = None
15+
depends_on = None
16+
17+
18+
def upgrade() -> None:
    """Add the ``image_count`` and ``infobox_count`` columns to ``submissions``.

    Each column is a nullable integer and is only added when the table does
    not already have a column of that name, so re-running the migration on a
    partially migrated table does not fail on a duplicate column.
    """
    existing = {
        column["name"]
        for column in inspect(op.get_bind()).get_columns("submissions")
    }

    # Order matters only for parity with the original: image first, then infobox.
    for column_name in ("image_count", "infobox_count"):
        if column_name in existing:
            continue
        op.add_column(
            "submissions",
            sa.Column(column_name, sa.Integer(), nullable=True),
        )
34+
35+
36+
def downgrade() -> None:
    """Drop the ``infobox_count`` and ``image_count`` columns from ``submissions``.

    A column is only dropped when it is currently present on the table, so the
    downgrade is safe to run even if one or both columns are already gone.
    """
    existing = {
        column["name"]
        for column in inspect(op.get_bind()).get_columns("submissions")
    }

    # Reverse of the upgrade order: infobox first, then image.
    for column_name in ("infobox_count", "image_count"):
        if column_name in existing:
            op.drop_column("submissions", column_name)

backend/app/models/submission.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,12 @@ class Submission(BaseModel):
7070
# Can be negative if article was reduced in size
7171
article_expansion_bytes = db.Column(db.Integer, nullable=True)
7272

73+
# Image count
74+
image_count = db.Column(db.Integer, nullable=True)
75+
76+
# Infobox count
77+
infobox_count = db.Column(db.Integer, nullable=True)
78+
7379
# Template enforcement tracking
7480
# True if template was automatically added to the article during submission
7581
template_added = db.Column(db.Boolean, nullable=True, default=False)
@@ -173,6 +179,8 @@ def __init__(
173179
template_added=False,
174180
categories_added=None,
175181
category_error=None,
182+
image_count=None,
183+
infobox_count=None,
176184
):
177185
"""
178186
Initialize a new Submission instance
@@ -192,6 +200,8 @@ def __init__(
192200
template_added: Whether template was automatically added to article (optional)
193201
categories_added: List of category names that were automatically added (optional, stored as JSON)
194202
category_error: Error message if category attachment failed (optional)
203+
image_count: Number of images in the article (optional)
204+
infobox_count: Number of infoboxes in the article (optional)
195205
"""
196206
# Set required fields
197207
self.user_id = user_id
@@ -219,6 +229,8 @@ def __init__(
219229
else:
220230
self.categories_added = None
221231
self.category_error = category_error
232+
self.image_count = image_count
233+
self.infobox_count = infobox_count
222234
self.reviewed_by = None
223235
self.reviewed_at = None
224236
self.review_comment = None
@@ -530,6 +542,8 @@ def to_dict(self, include_user_info=False):
530542
"article_page_id": self.article_page_id,
531543
"article_size_at_start": self.article_size_at_start,
532544
"article_expansion_bytes": self.article_expansion_bytes,
545+
"image_count": self.image_count,
546+
"infobox_count": self.infobox_count,
533547
"template_added": self.template_added,
534548
"categories_added": self.get_categories_added(),
535549
"category_error": self.category_error,

backend/app/routes/contest_routes.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
check_article_has_category,
3232
append_categories_to_article,
3333
get_article_reference_count,
34+
get_article_image_count,
35+
get_article_infobox_count,
3436
MEDIAWIKI_API_TIMEOUT,
3537
)
3638
from app.services.outreach_dashboard import (
@@ -1314,6 +1316,8 @@ def submit_to_contest(contest_id): # pylint: disable=too-many-return-statements
13141316
article_size_at_start = None
13151317
article_expansion_bytes = None
13161318
article_reference_count = None
1319+
image_count = None
1320+
infobox_count = None
13171321

13181322
# --- Fetch Article Information from MediaWiki API ---
13191323
# MediaWiki API fetching has deep nesting due to complex error handling
@@ -1699,6 +1703,31 @@ def submit_to_contest(contest_id): # pylint: disable=too-many-return-statements
16991703
pass
17001704
article_reference_count = None
17011705

1706+
# --- Fetch Image and Infobox Counts ---
1707+
# These richness metrics are stored for future use but are not used in
1708+
# validation or scoring yet.
1709+
try:
1710+
image_count = get_article_image_count(article_link)
1711+
except Exception as img_error: # pylint: disable=broad-exception-caught
1712+
try:
1713+
current_app.logger.warning(
1714+
"Failed to fetch image count: %s", str(img_error)
1715+
)
1716+
except Exception: # pylint: disable=broad-exception-caught
1717+
pass
1718+
image_count = None
1719+
1720+
try:
1721+
infobox_count = get_article_infobox_count(article_link)
1722+
except Exception as ibx_error: # pylint: disable=broad-exception-caught
1723+
try:
1724+
current_app.logger.warning(
1725+
"Failed to fetch infobox count: %s", str(ibx_error)
1726+
)
1727+
except Exception: # pylint: disable=broad-exception-caught
1728+
pass
1729+
infobox_count = None
1730+
17021731
# --- Validate Article Requirements ---
17031732
# Validate article byte count against contest requirements
17041733
# This check happens after fetching article information from MediaWiki API
@@ -2060,6 +2089,8 @@ def submit_to_contest(contest_id): # pylint: disable=too-many-return-statements
20602089
template_added=template_added,
20612090
categories_added=categories_added,
20622091
category_error=category_error,
2092+
image_count=image_count,
2093+
infobox_count=infobox_count,
20632094
)
20642095

20652096
submission.save()

backend/app/utils/__init__.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
"append_categories_to_article",
4545
"get_article_reference_count",
4646
"get_mediawiki_user_edit_count",
47+
"get_article_image_count",
48+
"get_article_infobox_count",
4749
]
4850

4951

@@ -948,6 +950,63 @@ def _fetch_footnotes_count(api_url: str, page_title: str, headers: dict) -> int:
948950
return 0
949951

950952

953+
def _log_warning(message: str, error: Exception) -> None:
954+
"""Best-effort logging helper that uses Flask current_app when available.
955+
956+
This keeps network helpers free from hard Flask dependencies while still
957+
providing useful diagnostics in a running application.
958+
"""
959+
try:
960+
from flask import current_app
961+
962+
current_app.logger.warning("%s: %s", message, str(error))
963+
except Exception: # pylint: disable=broad-exception-caught
964+
# Logging must never break core logic, so ignore any logging failures
965+
pass
966+
967+
968+
def get_article_image_count(article_url: str) -> Optional[int]:
    """Count approximate number of image links in article wikitext.

    Detection is a case-insensitive regex scan of the raw wikitext for
    explicit file links such as ``[[File:Example.jpg|...]]`` or
    ``[[Image:Example.png|...]]``. Whitespace after ``[[`` and around the
    colon is tolerated (``[[ File : Example.jpg]]``), because MediaWiki
    ignores it when resolving the link target.

    The count is approximate and based purely on wikitext patterns; it does
    not guarantee that every match results in a rendered image, but it
    generally tracks user-added content images. Entries that are not written
    as ``[[File:``/``[[Image:`` links (for example bare file names passed as
    template parameters or listed inside ``<gallery>``) are not matched, and
    neither are ``[[:File:...]]`` links, which start with a colon.

    Args:
        article_url: URL of the article; passed unchanged to
            ``get_article_wikitext``.

    Returns:
        The number of matches, or ``None`` when ``get_article_wikitext``
        returns ``None`` or any exception is raised while fetching or
        scanning the wikitext.
    """
    try:
        wikitext = get_article_wikitext(article_url)
        if wikitext is None:
            return None

        # ``\s*`` after ``[[`` and around ``:`` covers spaced variants that
        # still resolve to a File/Image link.
        matches = re.findall(
            r"\[\[\s*(?:File|Image)\s*:", wikitext, flags=re.IGNORECASE
        )
        return len(matches)

    except Exception as error:  # pylint: disable=broad-exception-caught
        # Best-effort metric: report the problem and signal "unknown" rather
        # than failing the caller.
        _log_warning("Failed to fetch image count", error)
        return None
987+
988+
989+
def get_article_infobox_count(article_url: str) -> Optional[int]:
    """Return an approximate count of infobox templates in an article.

    The raw wikitext is scanned case-insensitively for template openings of
    the form ``{{infobox ...`` (optional whitespace after the braces). This
    is a heuristic: nested templates or unusual formatting can make it
    over-count or under-count, which is acceptable for a high-level richness
    metric.

    Args:
        article_url: URL of the article; passed unchanged to
            ``get_article_wikitext``.

    Returns:
        The number of matches, or ``None`` when ``get_article_wikitext``
        returns ``None`` or any exception is raised along the way.
    """
    try:
        article_text = get_article_wikitext(article_url)
        if article_text is not None:
            infobox_openings = re.finditer(
                r"\{\{\s*infobox\b", article_text, flags=re.IGNORECASE
            )
            return sum(1 for _ in infobox_openings)
        return None
    except Exception as error:  # pylint: disable=broad-exception-caught
        # Best-effort metric: log and report "unknown" instead of raising.
        _log_warning("Failed to fetch infobox count", error)
        return None
1008+
1009+
9511010
def get_article_reference_count(article_url: str) -> Optional[int]:
9521011
"""
9531012
Get the total number of references in a MediaWiki article.

0 commit comments

Comments
 (0)