Skip to content

Commit 6919f76

Browse files
committed
Tweak to get number of articles for each word, not just occurrences
1 parent fd71a81 commit 6919f76

File tree

2 files changed

+80
-7
lines changed

2 files changed

+80
-7
lines changed

db/sql.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,3 +464,59 @@ def frequency(
464464
datefmt=datefmt,
465465
)
466466
return result
467+
468+
469+
class ArticleFrequencyQuery(_BaseQuery):
    """A query yielding the number of distinct articles a given word
    occurs in over a given period of time, broken down by either
    day or week.

    Every period produced by the generated date series is present in
    the output; periods without any matching article get a count of 0.
    """

    # The 'days' CTE generates one bucket label per time unit between
    # :start and :end; 'appearances' counts distinct articles per bucket.
    # The left outer join keeps buckets that have no matching articles.
    # Ordering is applied on the outer SELECT, because an ORDER BY inside
    # a CTE does not guarantee the order of the final, joined result.
    _Q = """
        with days as (
            select to_char(d, :datefmt) date
            from generate_series(
                :start,
                :end,
                :timeunit
            ) d
        ),
        appearances as (
            select to_char(a.timestamp, :datefmt) date, count(distinct w.article_id) cnt
            from words w, articles a
            where w.stem = :stem
                and w.cat = :cat
                and w.article_id = a.id
                and a.timestamp >= :start
                and a.timestamp <= :end
            group by date
        )
        select days.date, coalesce(appearances.cnt,0) from days
            left outer join appearances on days.date = appearances.date
            order by days.date;
        """

    @classmethod
    def frequency(
        cls,
        stem: str,
        cat: str,
        start: datetime,
        end: datetime,
        timeunit: str = "day",
        enclosing_session: Optional[Session] = None,
    ) -> Iterable[Any]:
        """Return the per-period article counts for a word.

        Args:
            stem: Word stem to match against ``words.stem``.
            cat: Word category to match against ``words.cat``.
            start: Inclusive lower bound on the article timestamp.
            end: Inclusive upper bound on the article timestamp.
            timeunit: Bucket size, either ``"day"`` or ``"week"``.
            enclosing_session: Optional existing session, handed to
                ``SessionContext`` (presumably reused instead of opening
                a new one -- confirm against ``SessionContext``).

        Returns:
            Whatever ``execute()`` yields for the query; judging by the
            SELECT list this is rows of ``(date_label, article_count)``,
            ordered by date label.

        Raises:
            ValueError: If ``timeunit`` is not ``"day"`` or ``"week"``.
        """
        # Validate explicitly rather than with 'assert': asserts are
        # stripped under 'python -O', which would let an unsupported unit
        # flow into the SQL interval below. Checking first also avoids
        # opening a database session for a call that cannot succeed.
        if timeunit not in ("week", "day"):
            raise ValueError(
                f"timeunit must be 'day' or 'week', got {timeunit!r}"
            )
        # ISO year + ISO week number for weekly buckets; calendar date
        # for daily buckets. The same format labels both CTEs so that
        # the join keys line up.
        datefmt = "IYYY-IW" if timeunit == "week" else "YYYY-MM-DD"
        # Step passed to generate_series(), e.g. "1 day"
        tu = f"1 {timeunit}"
        result: Iterable[Any] = []
        with SessionContext(session=enclosing_session, read_only=True) as session:
            result = cls().execute(
                session,
                stem=stem,
                cat=cat,
                start=start,
                end=end,
                timeunit=tu,
                datefmt=datefmt,
            )
        return result

get_word_frequencies.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import csv
22
from datetime import datetime, timedelta, date, time
3-
from db.sql import WordFrequencyQuery
3+
from db.sql import WordFrequencyQuery, ArticleFrequencyQuery
44
from settings import Settings
55

66
def get_frequencies_to_csv_per_word():
@@ -14,10 +14,10 @@ def get_frequencies_to_csv_per_word():
1414
# Set the end date to the end of yesterday to exclude the current, unfinished day.
1515
today = date.today()
1616
end_date = datetime.combine(today, time.min) - timedelta(seconds=1)
17-
start_date = end_date - timedelta(days=7 * 365) # 7 years back
17+
start_date = end_date - timedelta(days=(7 * 365) - 1)
1818

1919
words_to_check = [
20-
# ("stýrivextir", "kk"),
20+
("stýrivextir", "kk"),
2121
("verðbólga", "kvk")
2222
]
2323

@@ -27,19 +27,36 @@ def get_frequencies_to_csv_per_word():
2727

2828
with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
2929
csv_writer = csv.writer(csvfile)
30-
csv_writer.writerow(['date', 'count'])
30+
csv_writer.writerow(['date', 'count', 'article_count'])
3131

32-
results = WordFrequencyQuery.frequency(
32+
word_freq = WordFrequencyQuery.frequency(
3333
stem=stem,
3434
cat=cat,
3535
start=start_date,
3636
end=end_date,
3737
timeunit="day"
3838
)
3939

40+
article_freq = ArticleFrequencyQuery.frequency(
41+
stem=stem,
42+
cat=cat,
43+
start=start_date,
44+
end=end_date,
45+
timeunit="day"
46+
)
47+
48+
# Create dictionaries for quick lookup
49+
word_freq_dict = {d: c for d, c in word_freq}
50+
article_freq_dict = {d: c for d, c in article_freq}
51+
52+
# Get all unique dates from both queries
53+
all_dates = sorted(list(set(word_freq_dict.keys()) | set(article_freq_dict.keys())))
54+
4055
rows_written = 0
41-
for d, count in results:
42-
csv_writer.writerow([d, count])
56+
for d in all_dates:
57+
count = word_freq_dict.get(d, 0)
58+
article_count = article_freq_dict.get(d, 0)
59+
csv_writer.writerow([d, count, article_count])
4360
rows_written += 1
4461

4562
if rows_written > 0:

0 commit comments

Comments
 (0)