Commits
25 commits
75f73f8
Minimal working setup
kelvinkipruto Apr 24, 2024
b92ef3e
Working version with DB
kelvinkipruto May 6, 2024
a4541de
Cleanup
kelvinkipruto May 6, 2024
aec1820
Run time improvements
kelvinkipruto May 6, 2024
8c0b06f
Remove unused imports
kelvinkipruto May 6, 2024
95dae7f
Merge branch 'main' of https://github.com/CodeForAfrica/api into ft/m…
kelvinkipruto May 6, 2024
9e17c89
Docker files
kelvinkipruto May 6, 2024
1469485
validate robots.txt
kelvinkipruto May 7, 2024
1e1c00d
Improve script to capture extra required fields
kelvinkipruto May 14, 2024
3140ecb
Rename to content_access_bot
kelvinkipruto May 14, 2024
906ba75
use case insensitivity when matching crawlers
kelvinkipruto May 17, 2024
e1dd2e4
Improve url redirects check
kelvinkipruto May 17, 2024
f74769b
Update list of crawlers
kelvinkipruto May 17, 2024
73a0031
use environs instead of dotenv
kelvinkipruto May 17, 2024
d8981e1
Misc improvements
kelvinkipruto May 17, 2024
883a8ab
Code changes
kelvinkipruto Oct 4, 2024
b551b3e
Working Update
kelvinkipruto Jun 13, 2025
09bc272
Refactor database imports to use sqliteDB module
kelvinkipruto Jun 17, 2025
f13a25c
Improve script reliability
kelvinkipruto Jun 17, 2025
782b921
Fix SQL table definition to allow NULL values for archived robots fields
kelvinkipruto Jun 18, 2025
a2761a5
Simplified working scrapper
kelvinkipruto Jun 19, 2025
a1d7374
Update interpreter constraints to include Python 3.10
kelvinkipruto Jun 19, 2025
df6e7a3
Enhance database connection timeout and improve robots fetching logic
kelvinkipruto Jun 24, 2025
b3352ff
refactor(db): implement site checks tracking system
kelvinkipruto Sep 5, 2025
7ab4278
Merge branch 'main' into ft/midiadata-init
kelvinkipruto Sep 5, 2025
2 changes: 1 addition & 1 deletion mediadata_ai_blocklist/py/airtable.py
@@ -70,6 +70,6 @@ async def batch_upsert_organizations(data):
logging.info('Upserting organizations in Airtable')
try:
table = at.table(base_id, content_table)
table.batch_upsert(records=data, key_fields=['URL',])
table.batch_upsert(records=data, key_fields=['id',])
except Exception as e:
logging.error(f'Error upserting organization: {e}')
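
This hunk switches the upsert key from the "URL" field to the "id" field, so re-runs match existing Airtable rows even if a site's URL changes. For reference, a minimal sketch of how pyairtable's batch_upsert is typically called with key_fields (assumes pyairtable 2.x; the token, base id, table name and field values below are placeholders, not this project's configuration):

# Sketch only: placeholder credentials and ids, pyairtable 2.x API.
from pyairtable import Api

api = Api("AIRTABLE_TOKEN")
table = api.table("appXXXXXXXXXXXXXX", "Content")

records = [
    {
        "fields": {
            "id": "recXXXXXXXXXXXXXX",      # key field used to match existing rows
            "URL": "https://example.com",
            "Blocks AI Crawlers": True,
        }
    }
]

# Rows whose "id" field matches are updated in place; unmatched records are created.
table.batch_upsert(records=records, key_fields=["id"])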
43 changes: 40 additions & 3 deletions mediadata_ai_blocklist/py/database.py
@@ -1,7 +1,6 @@
import sqlite3
from dataclasses import dataclass
from sqlite3 import Error
from typing import List
from dotenv import load_dotenv
import os

@@ -15,6 +14,10 @@ class MediaHouse:
url: str
airtable_id: str
id: str = None
site_status: str = None
site_reachable: bool = None
site_redirect: bool = None
final_url: str = None


@dataclass
@@ -56,8 +59,12 @@ def create_table(self):
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
country TEXT NOT NULL,
url TEXT NOT NULL UNIQUE,
airtable_id TEXT NOT NULL UNIQUE
url TEXT NOT NULL,
airtable_id TEXT NOT NULL UNIQUE,
site_status TEXT,
site_reachable BOOLEAN,
site_redirect BOOLEAN,
final_url TEXT
);
CREATE TABLE IF NOT EXISTS robots (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -117,6 +124,36 @@ def select_all_media_houses(self):
finally:
cur.close()

def update_site_status(self, media_house_id, site_status, site_reachable, site_redirect, final_url):
try:
sql = """
UPDATE media_house
SET site_status = ?, site_reachable = ?, site_redirect = ?, final_url = ?
WHERE id = ?
"""
cur = self.conn.cursor()
cur.execute(sql, (site_status, site_reachable,
site_redirect, final_url, media_house_id))
self.conn.commit()
except Error as e:
print(e)
finally:
cur.close()

def get_reachable_sites(self):
try:
cur = self.conn.cursor()
cur.execute("SELECT * FROM media_house WHERE site_reachable = 1")
rows = cur.fetchall()
column_names = [column[0] for column in cur.description]
dict_rows = [dict(zip(column_names, row)) for row in rows]
return dict_rows
except Error as e:
print(e)
return None
finally:
cur.close()

def close_connection(self):
self.conn.close()

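
get_reachable_sites builds plain dicts by zipping cur.description column names with each row tuple. An equivalent, slightly shorter pattern (not what this PR uses) relies on sqlite3.Row as the connection's row factory; a minimal sketch, assuming a local "media.db" file containing the schema above:

# Sketch: dict-style rows via sqlite3.Row instead of zipping cur.description.
import sqlite3

conn = sqlite3.connect("media.db")
conn.row_factory = sqlite3.Row        # rows now support row["column_name"] access

cur = conn.cursor()
cur.execute("SELECT * FROM media_house WHERE site_reachable = 1")
reachable = [dict(row) for row in cur.fetchall()]   # same shape as get_reachable_sites()
cur.close()
conn.close()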
3 changes: 3 additions & 0 deletions mediadata_ai_blocklist/py/diff.py
@@ -50,6 +50,9 @@ def diff_robot_files(media_house: MediaHouse, db: Database):
data['blocks_crawlers'] = True if found_crawlers else False
data['notes'] = 'Robots.txt has been updated to block AI crawlers' if found_crawlers and not archive_crawlers else None
data['latest_robots_url'] = latest_robots['url']
data['latest_robots_date'] = latest_robots['timestamp']
data['latest_robots_content'] = latest_robots['content']
data['archived_robots_url'] = oldest_archived_robots['url']
data['archived_date'] = oldest_archived_robots['archived_date']
data['archived_robots_content'] = oldest_archived_robots['content']
return data
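
The extra fields carry the snapshot timestamps and raw robots.txt bodies forward so main.py can surface them in Airtable. The timestamps use the 14-digit "%Y%m%d%H%M%S" format used throughout this PR; a small sketch of the conversion main.py applies for the "Checked" and "Archive Date" columns:

# Sketch: converting a 14-digit timestamp into the ISO date string sent to Airtable.
import datetime

timestamp = "20240517093045"   # example value, same shape as latest_robots_date
checked = datetime.datetime.strptime(timestamp, "%Y%m%d%H%M%S").date().isoformat()
print(checked)                 # -> "2024-05-17"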
65 changes: 58 additions & 7 deletions mediadata_ai_blocklist/py/main.py
@@ -1,4 +1,5 @@
import asyncio
from yarl import URL
import random
import aiohttp
from airtable import get_organizations, batch_upsert_organizations
@@ -15,27 +16,48 @@


async def update_airtable(db: Database):
all_orgs = db.select_all_media_houses()
all_orgs = db.get_reachable_sites()
data_update = []
for org in all_orgs:
diff_data = diff_robot_files(org, db)
if (diff_data):
update_data = {
"fields": {
"URL": org['url'],
"Organisation Name": org['name'],
"id": org['airtable_id'],
"Blocks AI Crawlers": diff_data['blocks_crawlers'],
"Blocked Crawlers": diff_data['crawler'],
"Current Robots": diff_data['latest_robots_url'],
"Archived Robots": diff_data['archived_robots_url'],
"Current Robots URL": diff_data['latest_robots_url'],
"Checked": datetime.datetime.strptime(diff_data['latest_robots_date'], "%Y%m%d%H%M%S").date().isoformat(),
"Current Robots Content": diff_data['latest_robots_content'],
"Archived Robots URL": diff_data['archived_robots_url'],
"Archive Date": datetime.datetime.strptime(diff_data['archived_date'], "%Y%m%d%H%M%S").date().isoformat(),
"Archived Robots Content": diff_data['archived_robots_content'],
}
}
data_update.append(update_data)

await batch_upsert_organizations(data_update)


async def update_airtable_site_status(db: Database):
all_orgs = db.select_all_media_houses()
data_update = []
for org in all_orgs:
update_data = {
"fields": {
"id": org['airtable_id'],
"Organisation": [org['airtable_id']],
"URL": org['url'],
"Reachable": bool(org['site_reachable']),
"Redirects": bool(org['site_redirect']),
"Final URL": org['final_url'],
}
}
data_update.append(update_data)

await batch_upsert_organizations(data_update)


async def fetch_orgs(db: Database):
organizations = get_organizations()
for media_house in organizations:
@@ -44,8 +66,27 @@ async def fetch_orgs(db: Database):
db.insert_media_house(media_house_obj)


async def check_site_availability(url: str):
async with aiohttp.ClientSession() as session:
try:
async with session.get(url, allow_redirects=True) as response:
return {
"status_code": response.status,
"reachable": True,
"redirect": URL(response.url).with_scheme('').with_path(response.url.path.rstrip('/')) != URL(url).with_scheme('').with_path(URL(url).path.rstrip('/')),
"final_url": str(response.url)
}
except Exception:
return {
"status_code": None,
"reachable": False,
"redirect": False,
"final_url": None
}


async def fetch_robots(db: Database):
media_houses = db.select_all_media_houses()
media_houses = db.get_reachable_sites()
async with aiohttp.ClientSession() as session:
tasks = [asyncio.create_task(fetch_current_robots(
db, session, media_house)) for media_house in media_houses]
@@ -54,16 +95,26 @@ async def fetch_robots(db: Database):


async def fetch_archived_robots(db: Database):
media_houses = db.select_all_media_houses()
media_houses = db.get_reachable_sites()
async with aiohttp.ClientSession() as session:
tasks = [asyncio.create_task(fetch_past_robots(
db, session, media_house)) for media_house in media_houses]
await asyncio.gather(*tasks)
await asyncio.sleep(random.uniform(1, 3))


async def check_org_sites(db: Database):
all_orgs = db.select_all_media_houses()
for org in all_orgs:
site_status = await check_site_availability(org['url'])
db.update_site_status(org['id'], site_status['status_code'],
site_status['reachable'], site_status['redirect'], site_status['final_url'])


async def main(db: Database):
await fetch_orgs(db)
await check_org_sites(db)
await update_airtable_site_status(db)
await asyncio.gather(fetch_robots(db), fetch_archived_robots(db))
await update_airtable(db)

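
check_site_availability marks a response as a redirect when the final URL differs from the requested one after ignoring the scheme and any trailing slash. A hedged sketch of that comparison factored into a helper (it assumes only host and path matter; ports and query strings are ignored here, which may differ slightly from the inline yarl expression above):

# Sketch: the redirect comparison from check_site_availability as a helper.
# Assumes a redirect only counts when the host or the path (minus a trailing
# slash) changes; scheme upgrades like http -> https are not treated as redirects.
from yarl import URL

def normalized(url: str) -> tuple:
    u = URL(url)
    return (u.host, u.path.rstrip("/"))

def is_redirect(requested: str, final: str) -> bool:
    return normalized(requested) != normalized(final)

print(is_redirect("http://example.com", "https://example.com/"))       # False
print(is_redirect("http://example.com", "https://www.example.com/"))   # True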
6 changes: 0 additions & 6 deletions mediadata_ai_blocklist/py/robots.py
@@ -1,4 +1,3 @@
import os
import asyncio
import re
import aiohttp
@@ -123,10 +122,8 @@ async def fetch_current_robots(db: Database, session: aiohttp.ClientSession, med
try:
text = await fetch_robots(session, url)
if text:
print("Valid robots.txt")
robots = Robots(media_house['id'], robots_url,
datetime.now().strftime("%Y%m%d%H%M%S"), text, "200")
print(robots)
db.insert_robot(robots)
await asyncio.sleep(random.uniform(1, 3))
except Exception as e:
@@ -153,7 +150,6 @@ async def fetch_past_robots(db: Database, session: aiohttp.ClientSession, media_
return
snapshots = await fetch_internet_archive_snapshots(session, media_house['url'])
if snapshots:
print("Snapshots")
one_year_ago = (datetime.now() - timedelta(days=past_days)
).strftime("%Y%m%d%H%M%S")
closest_snapshot = find_closest_snapshot(snapshots, one_year_ago)
@@ -166,10 +162,8 @@
media_house['name']}: {closest_snapshot_url}""")
archive_robots = await fetch_robots(session, closest_snapshot_url)
if archive_robots:
print("Valid robots.txt")
archive_robots = ArchivedRobots(media_house['id'], closest_snapshot_url,
closest_snapshot['timestamp'], archive_robots, datetime.now().strftime("%Y%m%d%H%M%S"), "200")
print(archive_robots)
db.insert_archived_robot(archive_robots)
await asyncio.sleep(random.uniform(1, 3))
else:
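
fetch_past_robots keys the archive lookup off the snapshot closest to a date roughly one year back. find_closest_snapshot itself is not part of this diff; a minimal sketch of what such a helper could look like, assuming each snapshot is a dict with a 14-digit "timestamp" key and "closest" means nearest in either direction:

# Sketch only: find_closest_snapshot is not shown in this diff.
from datetime import datetime

def find_closest_snapshot(snapshots, target):
    """Return the snapshot whose 14-digit timestamp is nearest to target, or None."""
    if not snapshots:
        return None
    target_dt = datetime.strptime(target, "%Y%m%d%H%M%S")
    return min(
        snapshots,
        key=lambda s: abs(datetime.strptime(s["timestamp"], "%Y%m%d%H%M%S") - target_dt),
    )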