Skip to content

Commit b68e14f

Browse files
committed
chg: [crawler] add function to recrawl onion by month or to crawl all onion
1 parent 2f4f2c1 commit b68e14f

File tree

3 files changed

+42
-1
lines changed

3 files changed

+42
-1
lines changed

bin/lib/crawlers.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1895,6 +1895,25 @@ def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=
18951895
external=external, new_task=new_task)
18961896
return task_uuid
18971897

1898+
def recrawl_domain(domain_id):
    """Create a new crawler task for an already known domain.

    The task is created with the domain's recorded parent, or 'manual' when
    the domain has none. On success the task uuid, domain id and parent are
    printed.

    :param domain_id: id of the domain to recrawl
    """
    dom = Domains.Domain(domain_id)
    # Fall back to 'manual' when the domain has no parent
    parent = dom.get_parent() or 'manual'
    task_uuid = create_task(dom.id, parent=parent, priority=0, new_task=True,
                            har=D_HAR, screenshot=D_SCREENSHOT)
    if not task_uuid:
        return
    print(task_uuid, dom.id, parent)
1906+
1907+
def recrawl_onion_domains(date_month=None, all_onions_up=False):  # TODO RENAME ME
    """Create a recrawl task for a selection of onion domains.

    :param date_month: month to select, as a 'YYYYMM' string. When empty,
        the previous month is used.
    :param all_onions_up: when true, date_month is ignored and every domain
        returned by Domains.get_domains_up_by_type('onion') is recrawled.
    """
    if all_onions_up:
        to_crawl = Domains.get_domains_up_by_type('onion')
    else:
        if not date_month:
            # get_previous_month_date() returns a full 'YYYYMMDD' date, while
            # get_domains_by_month() expects 'YYYYMM' (it appends '01' itself).
            # Keep only the year and month so the whole month is selected.
            date_month = Date.get_previous_month_date()[:6]
        # set(): recrawl each domain only once
        to_crawl = set(Domains.get_domains_by_month(date_month, ['onion']))
    for onion in to_crawl:
        recrawl_domain(onion)
1916+
18981917
## -- CRAWLER TASK -- ##
18991918

19001919
#### CRAWLER TASK API ####
@@ -2439,6 +2458,7 @@ def change_onion_filter_unknown_state(new_state):
24392458
load_blacklist()
24402459

24412460
# if __name__ == '__main__':
2461+
# recrawl_onion_domains(date_month='202502', all_onions_up=False)
24422462
# delete_captures()
24432463
#
24442464
# item_id = 'crawled/2023/02/20/data.gz'

bin/lib/objects/Domains.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -609,6 +609,19 @@ def get_domains_dates_by_daterange(date_from, date_to, domain_types, up=True, do
609609
date_domains[date] = list(domains)
610610
return date_domains
611611

612+
def get_domains_by_month(date_month, domains_types, up=True, down=True):
    """Return the domains found by get_domains_by_daterange() over one month.

    Only the 'onion' and 'web' types are looked up; any other entry in
    domains_types is ignored.

    :param date_month: month to select, as a 'YYYYMM' string
    :param domains_types: container of domain types to include
    :param up: forwarded to get_domains_by_daterange()
    :param down: forwarded to get_domains_by_daterange()
    :return: onion domains (if requested) followed by web domains (if requested)
    """
    first_day = f'{date_month}01'
    last_day = Date.get_month_last_day(date_month)

    domains = get_domains_by_daterange(first_day, last_day, 'onion', up=up, down=down) if 'onion' in domains_types else []

    if 'web' in domains_types:
        web_domains = get_domains_by_daterange(first_day, last_day, 'web', up=up, down=down)
        if web_domains:
            domains.extend(web_domains)
    return domains
624+
612625
def get_domain_up_iterator():
613626
for domain_type in get_all_domains_types():
614627
for dom_id in get_domains_up_by_type(domain_type):

bin/packages/Date.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
from dateutil.relativedelta import relativedelta
99

1010
def convert_date_str_to_datetime(date_str):
    """Convert a 'YYYYMMDD' string into a datetime.date object."""
    year, month, day = int(date_str[:4]), int(date_str[4:6]), int(date_str[6:8])
    return datetime.date(year, month, day)
1313

1414
def get_full_month_str(date_from, date_to):
@@ -281,6 +281,14 @@ def get_previous_month_date():
281281
last_month = first - datetime.timedelta(days=1)
282282
return last_month.strftime("%Y%m%d")
283283

284+
def get_month_last_day(date_month):
    """Return the last day of a month as a 'YYYYMMDD' string.

    :param date_month: date string starting with 'YYYYMM'; only the year and
        month characters are read
    :return: last calendar day of that month, formatted as 'YYYYMMDD'
    """
    year = int(date_month[0:4])
    month = int(date_month[4:6])
    # The last day is the day before the first day of the following month.
    # After December the following month is January of the NEXT year.
    if month == 12:
        next_month_first_day = datetime.date(year + 1, 1, 1)
    else:
        next_month_first_day = datetime.date(year, month + 1, 1)
    return (next_month_first_day - datetime.timedelta(days=1)).strftime("%Y%m%d")
287+
288+
def get_current_month():
    """Return the current month as a 'YYYYMM' string."""
    return datetime.date.today().strftime("%Y%m")
291+
284292
def get_current_year():
    """Return the current year as a 'YYYY' string."""
    return datetime.date.today().strftime("%Y")

0 commit comments

Comments
 (0)