Skip to content
This repository was archived by the owner on Jun 30, 2024. It is now read-only.

Commit 839a6d8

Browse files
committed
A nightly script to append to page_views
1 parent 58d6364 commit 839a6d8

File tree

1 file changed

+169
-0
lines changed

1 file changed

+169
-0
lines changed

scripts/buildPageViews.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
#!/usr/bin/env python
# coding: utf-8

# # Create / update a page view table
#
# Columns appended to page_views:
# * Course
# * Base Course
# * chapter
# * subchapter
# * timestamp
# * sid
# * useinfo id
#
# --
# Possible future additions:
# * chapter name?
# * subchapter name?
# * chapter number?
# * subchapter number?

# In[104]:

import datetime
import os

import pandas as pd
from sqlalchemy import create_engine

# The connection string comes from the environment (e.g. exported by the
# cron wrapper) so no credentials are hard-coded in this script.
eng = create_engine(os.environ["DBURL"])
30+
31+
32+
# In[105]:

# Only courses from terms starting on/after Fall 2022 are considered.
# LY (one year before TF) is computed but not used below — presumably
# kept for ad-hoc year-over-year comparisons.
TIMEFRAME = "2022-08-01"
TF = datetime.datetime.fromisoformat(TIMEFRAME)
LY = TF - datetime.timedelta(days=365)
38+
39+
40+
# In[106]:

# Find the newest timestamp already in page_views so we only append
# events that arrived after the last nightly run.
# NOTE(review): if page_views is empty, last_ts is None/NaT and the
# `timestamp > %(last_ts)s` filter below matches no rows — confirm the
# table is seeded before the first run.
last_ts_sql = "select max(timestamp) as last_ts from page_views"
tmp = pd.read_sql_query(last_ts_sql, eng)

# In[107]:

print(tmp["last_ts"][0])
50+
51+
52+
# In[108]:

# NOTE(review): the notebook export wrapped this query in
# get_ipython().run_cell_magic("time", ...).  get_ipython() does not
# exist outside an IPython kernel, so a nightly cron run would crash
# with NameError.  Call pandas directly instead (the %%time timing
# output is lost, which is fine for a batch job).
#
# Fetch every 'page' event newer than the newest row already present in
# page_views, limited to courses whose term started on/after TF.
pages = pd.read_sql_query(
    """select * from useinfo
       join courses on useinfo.course_id = courses.course_name
       where useinfo.timestamp > %(last_ts)s
       and courses.term_start_date >= %(start)s
       and event = 'page'
    """,
    params={"last_ts": tmp.last_ts[0], "start": TF},
    con=eng,
    parse_dates=["term_start_date", "timestamp"],
)

# In[109]:

# A bare `len(pages)` only displays in a notebook; print() so the row
# count shows up in the nightly log.
print(len(pages))
# In[110]:

def get_chapter(divid):
    """Return the chapter label of a page div_id.

    The div_id for a Runestone page event is a URL-style path; the
    chapter is the second-to-last component.  Returns None when the
    path has fewer than two components.
    """
    path_parts = divid.split("/")
    return path_parts[-2] if len(path_parts) >= 2 else None
75+
76+
77+
def get_subchapter(divid):
    """Return the subchapter label of a page div_id.

    The subchapter is the final path component with any ".html"
    suffix removed.  Returns None when the path has fewer than two
    components.
    """
    path_parts = divid.split("/")
    if len(path_parts) < 2:
        return None
    return path_parts[-1].replace(".html", "")
81+
82+
83+
# In[111]:

# Derive chapter/subchapter labels from the page URL stored in div_id,
# then bucket each hit into a week number relative to its course's
# term start date.
pages["chapter"] = pages["div_id"].map(get_chapter)
pages["subchapter"] = pages["div_id"].map(get_subchapter)
pages["time_from_start"] = pages["timestamp"] - pages["term_start_date"]
pages["week"] = pages["time_from_start"].dt.days // 7
# Discard hits logged before the term officially started (negative weeks).
pages = pages.loc[pages["week"] >= 0]

# In[112]:

print(len(pages))
97+
98+
99+
# In[113]:

# Load the chapter/subchapter display names.  On the chapters table,
# course_id holds the base course name.
chapter_titles_sql = (
    "select * from chapters "
    "join sub_chapters on chapters.id = sub_chapters.chapter_id "
)
titles = pd.read_sql_query(chapter_titles_sql, eng)
109+
110+
111+
# The next step needs to be different for PTX books and Runestone books...
# 1. For Runestone books I can deduce the chapter and subchapter from the URL in div_id
# 2. For ptx books I can deduce the sub_chapter id from the URL and would want to merge with the basecourse+subchap

# In[114]:

# Attach human-readable titles via an inner join; rows whose
# (base_course, chapter, subchapter) triple has no match in `titles`
# are dropped by the default inner merge.
mp = pd.merge(
    pages,
    titles,
    left_on=["base_course", "chapter", "subchapter"],
    right_on=["course_id", "chapter_label", "sub_chapter_label"],
)
123+
124+
125+
# In[115]:

# Keep only the columns that make up the page_views table schema.
_PAGE_VIEW_COLUMNS = [
    "timestamp",
    "term_start_date",
    "week",
    "course_name",
    "base_course",
    "chapter",
    "subchapter",
    "courselevel",
    "chapter_name",
    "sub_chapter_name",
]
mp = mp[_PAGE_VIEW_COLUMNS]
142+
143+
144+
# In[116]:

# Append the new rows to page_views.  method="multi" packs many rows
# into each INSERT statement; chunksize keeps individual statements a
# manageable size.
mp.to_sql(
    "page_views",
    con=eng,
    if_exists="append",
    index=False,
    method="multi",
    chunksize=10000,
)
150+
151+
152+
# In[117]:
153+
154+
155+
# mp.to_csv("page_views.csv", index=False)
156+
157+
158+
# In[118]:
159+
160+
161+
# mp.sort_values('timestamp')
162+
163+
164+
# ```sql
165+
# create index chapter_id on page_views using btree(chapter);
166+
# create index sub_chapter_id on page_views using btree(subchapter);
167+
# create index base_course_idx on page_views using btree(base_course);
168+
# ```
169+
#

0 commit comments

Comments
 (0)