Skip to content

Commit 3d33518

Browse files
Create correlation.sql
1 parent 04ad32d commit 3d33518

File tree

1 file changed

+55
-0
lines changed

1 file changed

+55
-0
lines changed
Lines changed: 55 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,55 @@
1+
-- Pearson correlation between daily UK cases and the deaths reported 9 rows
-- later (9 days, assuming one row per date_rep), for 2020-03-03 .. 2020-06-06.
with
raw_data as (
    select
        date_rep,
        cases,
        -- Ordering by date_rep DESC makes LAG look forward in time: each row
        -- receives the deaths value of the row 9 positions later by date.
        deaths = lag(deaths, 9) over (order by date_rep desc)
    from openrowset(
        bulk 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
        format = 'parquet'
    ) as a
    where geo_id = 'UK'
      and date_rep between '2020-03-03' and '2020-06-06'
),
data as (
    select
        -- FLOAT, not BIGINT/INT: AVG over an integer type returns a truncated
        -- integer, which corrupts the covariance term below.
        x = cast(cases as float),
        y = cast(deaths as float)
    from raw_data
    -- The WHERE above is applied before the window function, so the 9 latest
    -- dates have no row to pair with and get deaths = NULL. Drop incomplete
    -- pairs so every aggregate below runs over the same set of rows.
    where cases is not null
      and deaths is not null
)
-- r = (E[xy] - E[x]E[y]) / (sd(x) * sd(y)); NULL when either series is constant.
select PearsonsR = (avg(x * y) - (avg(x) * avg(y))) / nullif(stdevp(x) * stdevp(y), 0)
from data;
13+
14+
15+
16+
-- Pearson and Spearman correlation between smoothed cases and smoothed deaths,
-- pooled over every geo_id in the dataset.
with
raw_data as (
    select
        geo_id,
        date_rep,
        -- Trailing average of deaths: the current row and the 3 rows before it.
        deaths = avg(deaths) over (
                     partition by geo_id
                     order by date_rep
                     rows between 3 preceding and current row
                 ),
        -- Average of cases over the rows 11..7 positions EARLIER by date, so
        -- that cases lead deaths. This is the same direction as the UK-only
        -- query in this file, which pairs cases with deaths reported 9 days
        -- later. (Ordering this window by date_rep DESC would instead pick
        -- rows 7..11 positions in the future.)
        cases = avg(cases) over (
                    partition by geo_id
                    order by date_rep
                    rows between 11 preceding and 7 preceding
                )
    from openrowset(
        bulk 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
        format = 'parquet'
    ) as a
),
data as (
    select
        x = cast(cases as bigint),
        y = cast(deaths as bigint)
    from raw_data
    -- Also removes rows whose window was empty (NULL average).
    where cases > 100
      and deaths > 10
),
ranked as (
    select
        -- FLOAT copies: AVG over BIGINT would return truncated integers.
        xf = cast(x as float),
        yf = cast(y as float),
        -- Mid-ranks: tied values share the average of the positions they span.
        rank_x = cast(rank() over (order by x) as float)
                 + cast(count(*) over (partition by x) - 1 as float) / 2,
        rank_y = cast(rank() over (order by y) as float)
                 + cast(count(*) over (partition by y) - 1 as float) / 2
    from data
)
select
    -- r = (E[xy] - E[x]E[y]) / (sd(x) * sd(y)); NULL when a series is constant.
    PearsonsR = (avg(xf * yf) - (avg(xf) * avg(yf)))
                / nullif(stdevp(xf) * stdevp(yf), 0),
    -- Spearman's rho is Pearson's r computed on the ranks. This form stays
    -- correct when there are ties, unlike 1 - 6*sum(d^2)/(n(n^2-1)), and d
    -- must be a difference of ranks, never of the raw values.
    SpearmanRho = (avg(rank_x * rank_y) - (avg(rank_x) * avg(rank_y)))
                  / nullif(stdevp(rank_x) * stdevp(rank_y), 0)
from ranked;
32+
33+
-- Kendall's rank correlation sample estimate τ between smoothed cases and
-- smoothed deaths. Pairs are only formed within the same geo_id; tied pairs
-- contribute 0 to the numerator but still count in the denominator.
with
raw_data as (
    select
        geo_id,
        date_rep,
        -- Trailing average of deaths: the current row and the 3 rows before it.
        deaths = avg(deaths) over (
                     partition by geo_id
                     order by date_rep
                     rows between 3 preceding and current row
                 ),
        -- Average of cases over the rows 11..7 positions EARLIER by date, so
        -- that cases lead deaths. This is the same direction as the UK-only
        -- query in this file, which pairs cases with deaths reported 9 days
        -- later. (Ordering this window by date_rep DESC would instead pick
        -- rows 7..11 positions in the future.)
        cases = avg(cases) over (
                    partition by geo_id
                    order by date_rep
                    rows between 11 preceding and 7 preceding
                )
    from openrowset(
        bulk 'https://pandemicdatalake.blob.core.windows.net/public/curated/covid-19/ecdc_cases/latest/ecdc_cases.parquet',
        format = 'parquet'
    ) as a
),
data as (
    select
        x = cast(cases as bigint),
        y = cast(deaths as bigint),
        class = geo_id,
        id = date_rep
    from raw_data
    -- Also removes rows whose window was empty (NULL average).
    where cases > 100
      and deaths > 10
)
select
    -- (concordant - discordant) / pairs. Computed in FLOAT: the previous
    -- NUMERIC(8,2) conversion overflows once the difference passes 999,999.99.
    Tau = cast(
              sum(cast(case
                           when (i.x < j.x and i.y < j.y) or (i.x > j.x and i.y > j.y) then 1   -- concordant
                           when (i.x < j.x and i.y > j.y) or (i.x > j.x and i.y < j.y) then -1  -- discordant
                           else 0                                                               -- tied on x or y
                       end as bigint))
              as float
          ) / nullif(count_big(*), 0)
from data as i
inner join data as j
    on  j.class = i.class
    -- Visit each unordered pair once. The ratio is the same as with i.id <> j.id
    -- because both orderings of a pair get the same classification.
    and j.id > i.id;

0 commit comments

Comments
 (0)