Skip to content

Commit 590a48c

Browse files
Update data-exploration.sql
1 parent c367fea commit 590a48c

File tree

1 file changed

+102
-41
lines changed

1 file changed

+102
-41
lines changed
Lines changed: 102 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11

2+
-- Preview the first ten rows of the raw CSV file (no header-related options are set).
SELECT TOP 10 *
FROM OPENROWSET(
        BULK 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.csv',
        FORMAT = 'csv',
        PARSER_VERSION = '2.0'
     ) AS csv_rows;
5+
6+
-- Use HEADER_ROW = TRUE because this file has a header row. With parser
-- version 2.0 the header row is not returned as data and its values become
-- the column names of the result set (FIRSTROW = 2 would only skip the row
-- and leave the columns unnamed).
SELECT TOP 10 *
FROM OPENROWSET(
        BULK 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.csv',
        FORMAT = 'csv',
        PARSER_VERSION = '2.0',
        HEADER_ROW = TRUE
     ) AS a;
@@ -13,51 +18,107 @@ select top 10 *
1318
from openrowset(bulk 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
1419
format='parquet') as a
1520

21+
-- ## Explore your data
22+
-- As a first step, we need to explore the data in the file placed in Azure storage using the `OPENROWSET` function:
1623

17-
select continent = ISNULL(continent_exp, 'Total'), cases = sum(cases), deaths = sum(deaths)
18-
from openrowset(bulk 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
19-
format='parquet') as cases
20-
group by continent_exp with rollup
21-
order by sum(cases) desc
24+
-- Preview the first ten rows of the Parquet version of the data set.
SELECT TOP 10 *
FROM OPENROWSET(
        BULK 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
        FORMAT = 'parquet'
     ) AS parquet_rows;
2227

23-
select countries_and_territories, geo_id
24-
from openrowset(bulk 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
25-
format='parquet') as cases
26-
where countries_and_territories like '%ser%'
2728

29+
-- Here we can see that some of the columns that are interesting for analysis are `DATE_REP` and `CASES`. I would like to analyze the number of cases reported in Serbia, so I need to filter the results using the `GEO_ID` column.
30+
-- We are not sure what the `geo_id` value for Serbia is, so we will find all distinct countries and `geo_id` values where the country name is something like Serbia:
2831

29-
select DATE_REP, CASES, DEATHS
30-
from openrowset(bulk 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
31-
format='parquet') as a
32-
where geo_id = 'RS'
32+
-- List every distinct (country name, geo_id) pair whose country name contains 'Ser'.
SELECT DISTINCT
    src.countries_and_territories,
    src.geo_id
FROM OPENROWSET(
        BULK 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
        FORMAT = 'parquet'
     ) AS src
WHERE src.countries_and_territories LIKE '%Ser%';
36+
37+
38+
-- Since we see that the `GEO_ID` for Serbia is `RS`, we can find the daily number of cases in Serbia:
39+
-- Reported cases per report date for geo_id 'RS', oldest date first.
SELECT
    src.DATE_REP,
    src.CASES
FROM OPENROWSET(
        BULK 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
        FORMAT = 'parquet'
     ) AS src
WHERE src.geo_id = 'RS'
ORDER BY src.date_rep;
3444

35-
-- cumulative values - running total:
36-
select DATE_REP, CASES,
37-
CUMULATIVE = SUM(CASES) OVER (ORDER BY date_rep)
38-
from openrowset(bulk 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
39-
format='parquet') as a
40-
where geo_id = 'RS'
41-
order by date_rep;
42-
43-
select DATE_REP,
44-
CASES,
45-
CASES_AVG = AVG(CASES) OVER(order by date_rep ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING )
46-
from openrowset(bulk 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet', format='parquet') as a
47-
where geo_id = 'RS' order by date_rep;
48-
49-
with diff as (
50-
select geo_id, date_rep, countries_and_territories,
51-
current_avg = AVG(CASES) OVER(partition by geo_id order by date_rep ROWS BETWEEN 7 PRECEDING AND CURRENT ROW ),
52-
prev_avg = AVG(CASES) OVER(partition by geo_id order by date_rep ROWS BETWEEN 14 PRECEDING AND 7 PRECEDING )
53-
from openrowset(bulk 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet', format='parquet') as a
54-
)
55-
select country = countries_and_territories,
56-
[cases/day (this week)] = current_avg,
57-
[cases/day (prev week)] = prev_avg,
58-
[change%] = CAST( 100*(1.*current_avg / prev_avg - 1) AS NUMERIC(4,1))
59-
from diff
60-
where date_rep = CAST('2020-10-04T00:00:00.0000000' as datetime2)
61-
and current_avg > prev_avg
62-
and prev_avg > 100
45+
46+
-- We can show this in a chart to see the trend of reported COVID cases in Serbia. By looking at this chart, we can see that the first peak is somewhere between the 15th and 20th of April, and the peak of the second wave is in the second half of July.
47+
-- The points on time series charts are shown on a daily basis. This might show day-to-day variation, so you might want to show the graph with average values calculated in a window of +/- 1-2 days. T-SQL enables you to easily calculate average values if you specify a time window:
48+
-- ```
49+
-- AVG(CASES) OVER(order by date_rep ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING )
50+
-- ```
51+
-- We need to specify how to locally order the data and the number of preceding/following rows that the AVG function should use to calculate the average value within the window. The time series query that uses average values is shown in the following code:
52+
53+
-- Smoothed time series for geo_id 'RS': each value is the average of the
-- current row, the two rows before it and the two rows after it.
SELECT
    src.DATE_REP,
    AVG(src.CASES) OVER (
        ORDER BY src.date_rep
        ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
    ) AS CASES_AVG
FROM OPENROWSET(
        BULK 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
        FORMAT = 'parquet'
     ) AS src
WHERE src.geo_id = 'RS'
ORDER BY src.date_rep;
58+
59+
60+
-- We can also show cumulative values to see increase of the number of cases over time (this is known as running total):
61+
62+
-- Running total of reported cases for geo_id 'RS', oldest date first.
SELECT
    DATE_REP,
    -- An explicit ROWS frame is used on purpose: with ORDER BY and no frame the
    -- default is RANGE UNBOUNDED PRECEDING, which gives every row sharing the
    -- same date_rep the same total. ROWS accumulates strictly row by row.
    CUMULATIVE = SUM(CASES) OVER (
        ORDER BY date_rep
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    )
FROM OPENROWSET(
        BULK 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
        FORMAT = 'parquet'
     ) AS a
WHERE geo_id = 'RS'
ORDER BY date_rep;
68+
69+
70+
-- If we switch to chart we can see cumulative number of cases that are reported since the first COVID case.
71+
-- The SQL language enables us to easily look up the number of cases reported a couple of days before or after using the LAG and LEAD functions. The following query will return the number of cases reported 7 days ago:
72+
73+
-- Ten most recent report dates for geo_id 'RS', each paired with the value of
-- CASES found seven rows earlier in date order.
SELECT TOP 10
    src.date_rep,
    src.cases,
    LAG(src.CASES, 7) OVER (
        PARTITION BY src.geo_id
        ORDER BY src.date_rep
    ) AS prev
FROM OPENROWSET(
        BULK 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
        FORMAT = 'parquet'
     ) AS src
WHERE src.geo_id = 'RS'
ORDER BY src.date_rep DESC;
80+
81+
82+
-- You can notice in the result that the prev column lags 7 days behind the cases column. Now we can easily compare the current number of reported cases with the number reported 7 days earlier, or calculate the percent of increase:
83+
-- ```
84+
-- WoW% = (cases - prev) / prev
85+
-- = cases/prev - 1
86+
-- ```
87+
-- Instead of a simple comparison of the current and previous values, we can make this more reliable by first calculating the average values in 7-day windows and then calculating the increase using these values:
88+
89+
-- Week-over-week change of the 7-row average number of cases for geo_id 'RS'.
-- NOTE(review): the frames count rows, not calendar days; this equals "7 days"
-- only if the source has exactly one row per date for the country - confirm.
WITH ecdc AS (
    SELECT
        date_rep,
        -- Current window: the current row plus the 6 rows before it (7 rows).
        cases = AVG(CASES) OVER (
            PARTITION BY geo_id
            ORDER BY date_rep
            ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
        ),
        -- Previous window: the 7 rows immediately before the current window,
        -- so the two windows are the same size and share no row.
        prev = AVG(CASES) OVER (
            PARTITION BY geo_id
            ORDER BY date_rep
            ROWS BETWEEN 13 PRECEDING AND 7 PRECEDING
        )
    FROM OPENROWSET(
            BULK 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
            FORMAT = 'parquet'
         ) AS a
    WHERE geo_id = 'RS'
)
SELECT
    date_rep,
    cases,
    prev,
    -- 1.0 * forces decimal division; NULLIF keeps the expression safe even if
    -- it is evaluated for a row that the WHERE clause later discards.
    [WoW%] = 100 * (1.0 * cases / NULLIF(prev, 0) - 1)
FROM ecdc
WHERE prev > 10   -- skip periods with too few cases for a meaningful percentage
ORDER BY date_rep ASC;
104+
105+
-- This query will calculate the average number of cases in 7-day window and calculate week over week change.
106+
-- We can go a step further and use the same query to run the analysis across all countries in the world, calculate weekly changes, and find the countries with the highest increase of COVID cases compared to the previous week.
107+
108+
109+
-- Top 10 countries by week-over-week increase of the 7-row average number of
-- cases, evaluated for yesterday's report date.
-- NOTE(review): the frames count rows, not calendar days; this equals "7 days"
-- only if the source has exactly one row per date per geo_id - confirm.
WITH weekly_cases AS (
    SELECT
        geo_id,
        date_rep,
        country = countries_and_territories,
        -- Current window: the current row plus the 6 rows before it (7 rows).
        current_avg = AVG(CASES) OVER (
            PARTITION BY geo_id
            ORDER BY date_rep
            ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
        ),
        -- Previous window: the 7 rows immediately before the current window,
        -- so the two windows are the same size and share no row.
        prev_avg = AVG(CASES) OVER (
            PARTITION BY geo_id
            ORDER BY date_rep
            ROWS BETWEEN 13 PRECEDING AND 7 PRECEDING
        )
    FROM OPENROWSET(
            BULK 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
            FORMAT = 'parquet'
         ) AS a
)
SELECT TOP 10
    country,
    current_avg,
    prev_avg,
    -- int instead of smallint so an extreme ratio cannot raise an arithmetic
    -- overflow; NULLIF guards the division regardless of evaluation order.
    [WoW%] = CAST(100 * (1.0 * current_avg / NULLIF(prev_avg, 0) - 1) AS int)
FROM weekly_cases
-- Yesterday as a date. A plain CAST is enough: a CONVERT style code has no
-- effect when converting a datetime to date.
-- NOTE(review): returns no rows if the file has no entry for yesterday.
WHERE date_rep = CAST(DATEADD(DAY, -1, GETDATE()) AS date)
  AND prev_avg > 100   -- ignore countries with too few cases for a stable ratio
ORDER BY (1.0 * current_avg / NULLIF(prev_avg, 0) - 1) DESC;

0 commit comments

Comments
 (0)