Skip to content

Commit b0489d7

Browse files
committed
export small tweak and central processing update
1 parent 26481bb commit b0489d7

File tree

4 files changed

+194
-152
lines changed

4 files changed

+194
-152
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: DataQuality
22
Title: Data Quality study in OHDSI and data quality evaluation
3-
Version: 4.0
3+
Version: 4.1
44
Authors@R: "Vojtech Huser <vojtech.huser@nih.gov> [aut, cre]"
55
Description: This package has several functions. It supports Data Quality Dashboard and OHDSI Data Quality study.
66
It also allows Data Quality evaluation (even after closure of the formal study).

extras/CentralProcessing.R

Lines changed: 0 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -29,147 +29,3 @@ ttd<-combHeel %>% left_join(lkup_rules)
2929
ttb2<-ttb %>% left_join(lkup_rules)
3030

3131

32-
33-
34-
#DQD Centralized processing
35-
#----------------------------------------
36-
#----------------------------------------
37-
#this script assummes coordinating center infrustructure for loading
38-
#athena dictionaries (.rda files istead of in a database)
39-
#now truly doing 2 sites
40-
41-
#load athena dictionary
42-
library(tidyverse);library(magrittr);options(tibble.print_max = 200)
43-
load('o:/athena/concept.rda')
44-
45-
#lkup<-concept %>% filter(vocabulary_id %in% c('CPT4','ICD9Proc','CDT','HCPCS','ICD9CM','ICD10CM','ICD10PCS'))
46-
47-
#reading a single site data (for now)
48-
f<-'d:/OneDrive - National Institutes of Health/temp/dqd/export'
49-
f<-'d:/OneDrive - National Institutes of Health/ohdsi/thresholds'
50-
51-
sfiles<-c(file.path(f,'1ThresholdsA.csv'))
52-
sfiles<-c(file.path(f,'test-ThresholdsA.csv'))
53-
sfiles<-c(file.path(f,'1ThresholdsA.csv'),file.path(f,'ThresholdsA.csv'),file.path(f,'test-ThresholdsA.csv'))
54-
ll<-map(sfiles,read_csv)
55-
ll
56-
57-
#ll<-map(p$pid,doProperty())
58-
#strip name from full path trick
59-
ll2<-map2(ll,basename(sfiles),~mutate(.x,site=.y))
60-
d<-bind_rows(ll2)
61-
62-
#add terminology concepts
63-
sconcept<-concept %>% select(concept_id,concept_name)
64-
names(d) <- tolower(names(d))
65-
names(d)
66-
#remove no units rows and expand the CIDs
67-
d2<-d %>% filter(stratum_1 != 0) %>% filter(stratum_2 != 0) %>% left_join(sconcept,by=c('stratum_1'='concept_id')) %>%
68-
left_join(sconcept,by=c('stratum_2'='concept_id'))
69-
names(d2)
70-
71-
#remove columns that are not needed
72-
# d3<-d2 %>% select(-stratum_3,-stratum_4,-stratum_5,-p25_value,-p75_value) %>%
73-
# filter(count_value >=100 ) %>% arrange(stratum_1,desc(count_value) )
74-
75-
d3<-d2 %>% select(-stratum_3,-stratum_4,-stratum_5) %>%
76-
arrange(stratum_1,desc(count_value) )
77-
78-
79-
d3 %>% count(site)
80-
names(d3)
81-
ba<-d3 %>% group_by(stratum_1,stratum_2,concept_name.x,concept_name.y) %>% summarize(tcnt=sum(count_value),n=n())
82-
ba %>% filter(n>=2)
83-
84-
85-
86-
#24 test-unit pairs have 2 results
87-
88-
89-
90-
#tests with more units
91-
d3 %>% count(stratum_1)
92-
93-
94-
#only where multiple sites
95-
d10<-d3 %>% inner_join(ba %>% filter(n>=2))
96-
97-
98-
99-
100-
101-
#even more removal of data
102-
d4<-d3 %>% select(-count_value,-median_value,-stdev_value,-avg_value,-site)
103-
104-
d4 %>% write_csv('extras/DqdResults/thresholds-list-A.csv')
105-
nrow(d4)
106-
107-
108-
#read DD checks
109-
library(stats);library(tidyverse);library(magrittr)
110-
#message("\n*** Successfully loaded .Rprofile ***\n")
111-
112-
113-
url='https://raw.githubusercontent.com/OHDSI/DataQualityDashboard/master/inst/csv/OMOP_CDMv5.3.1_Concept_Level.csv'
114-
dqd<-read_csv(url)
115-
str(dqd)
116-
names(dqd)
117-
dqd %>% dplyr::filter(cdmTableName=='MEASUREMENT' & cmdFieldName=='MEASUREMENT_CONCEPT_ID' )
118-
dqd %>% dplyr::filter(cdmFieldName=='MEASUREMENT_CONCEPT_ID' ) %>% nrow()
119-
dqd %>% count(cdmTableName,cdmFieldName)
120-
121-
122-
#compare data driven and expert drive sets
123-
d$STRATUM_1 %<>% as.integer()
124-
dqd$unitConceptId %<>% as.integer()
125-
expert <-dqd %>% dplyr::filter(cdmFieldName=='MEASUREMENT_CONCEPT_ID' )
126-
nrow(expert)
127-
names(expert)
128-
elabs<-expert %>% group_by(conceptId,conceptName) %>% summarise(unitcnt=n(),units=paste(unitConceptName,collapse = "|"))
129-
130-
#330 lab tests
131-
elabs %>% write_csv('extras/DqdResults/DQD-expert-driven-A-lab-list.csv')
132-
133-
names(expert)
134-
ddriven<-d %>% rename(conceptId=STRATUM_1,unitConceptId=STRATUM_2) %>% select(conceptId,unitConceptId) %>% unique()
135-
136-
names(d2)
137-
ddriven<-d %>% rename(conceptId=STRATUM_1,unitConceptId=STRATUM_2)
138-
ddriven<-d2 %>% rename(conceptId=stratum_1,unitConceptId=stratum_2)
139-
#ddriven %<>% filter(conceptId!=0)
140-
#ddriven %<>% filter(unitConceptId!=0)
141-
142-
over=expert %>% inner_join(ddriven) #58 overlapping
143-
View(over)
144-
expert %>% anti_join(ddriven) #827 are in expert but not in data
145-
146-
not1<-ddriven %>% anti_join(expert) #14 are in data and not in expert
147-
148-
149-
#compare the trehsholds
150-
names(over)
151-
over %>% select(conceptName,unitConceptName,plausibleValueLow,min_value)
152-
over %>% select(conceptName,unitConceptName,plausibleValueHigh,max_value)
153-
#%>% knitr::kable()
154-
155-
156-
#expert thresholds don't follow unit conversion logic (max and min is same even if units indicate order of magniture difference)
157-
#MEASUREMENT MEASUREMENT_CONCEPT_ID 3013721 Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma 8713 gram per deciliter 5 5 2000 5 NA NA NA NA NA NA NA NA
158-
#MEASUREMENT MEASUREMENT_CONCEPT_ID 3013721 Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma 8840 milligram per deciliter 5 5 2000
159-
160-
#5g/dL into mg/dL (is 5000 mg/dL)
161-
#in data is in fact unit/L
162-
163-
164-
165-
#unitmorph
166-
#Protein [Mass/volume] in Serum or Plasma 7096851 4 gram per deciliter|unit|milligram per deciliter|gram per liter
167-
# gram per deciliter| |milligram per deciliter | gram per liter
168-
names(d3)
169-
bb<-d3 %>% filter(site=='ThresholdsA.csv') %>% group_by(stratum_1,concept_name.x) %>%
170-
summarize(tcnt=sum(count_value)
171-
,n=n(),units=paste(concept_name.y,collapse = '|')
172-
,cnts=paste(count_value ,collapse = '|')
173-
,unitcids=paste(stratum_2,collapse = '|')
174-
)
175-
bb %>% write_csv('local/morphA.csv')
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
2+
3+
#DQD Centralized processing
4+
#----------------------------------------
5+
#----------------------------------------
6+
#this script assumes coordinating center infrastructure for loading
7+
#athena dictionaries (.rda files instead of in a database)
8+
#now truly doing 2 sites
9+
10+
#load athena dictionary
11+
library(tidyverse);library(magrittr);options(tibble.print_max = 200)
12+
load('o:/athena/concept.rda')
13+
14+
#lkup<-concept %>% filter(vocabulary_id %in% c('CPT4','ICD9Proc','CDT','HCPCS','ICD9CM','ICD10CM','ICD10PCS'))
15+
16+
#reading a single site data (for now)
17+
f<-'d:/OneDrive - National Institutes of Health/temp/dqd/export'
18+
f<-'d:/OneDrive - National Institutes of Health/ohdsi/thresholds'
19+
20+
sfiles<-c(file.path(f,'1ThresholdsA.csv'))
21+
sfiles<-c(file.path(f,'test-ThresholdsA.csv'))
22+
sfiles<-c(file.path(f,'1ThresholdsA.csv'),file.path(f,'ThresholdsA.csv'),file.path(f,'test-ThresholdsA.csv'))
23+
#3 sites processing +1
24+
sfiles<-c(file.path(f,'01ThresholdsB.csv')
25+
,file.path(f,'02ThresholdsB.csv')
26+
,file.path(f,'03ThresholdsB.csv')
27+
,file.path(f,'04ThresholdsB.csv')
28+
,file.path(f,'05ThresholdsB.csv')
29+
)
30+
ll<-map(sfiles,read_csv)
31+
ll
32+
33+
#ll<-map(p$pid,doProperty())
34+
#strip name from full path trick
35+
36+
#make lowercase the column names
37+
#each site's table keeps its data; only the column names are lower-cased
llmoded <- map(ll, function(site_tbl) setNames(site_tbl, tolower(names(site_tbl))))
38+
#llmoded[[1]]
39+
#ll[[1]]
40+
ll2<-map2(llmoded,basename(sfiles),~mutate(.x,site=.y))
41+
d<-bind_rows(ll2)
42+
43+
#add terminology concepts
44+
sconcept<-concept %>% select(concept_id,concept_name)
45+
names(d) <- tolower(names(d))
46+
names(d)
47+
#remove no units rows and expand the CIDs
48+
#stratum columns have the suffix id (stratum1_id, stratum2_id)
49+
# d2<-d %>% filter(stratum_1 != 0) %>% filter(stratum_2 != 0) %>% left_join(sconcept,by=c('stratum_1'='concept_id')) %>%
50+
# left_join(sconcept,by=c('stratum_2'='concept_id'))
51+
52+
53+
d2<-d %>% filter(count_value >=11 ) %>% filter(stratum1_id != 0) %>% filter(stratum2_id != 0) %>% left_join(sconcept,by=c('stratum1_id'='concept_id')) %>%
54+
left_join(sconcept,by=c('stratum2_id'='concept_id')) %>% filter(!is.na(concept_name.x))
55+
#tests in the 2B range are excluded by the last filter (rows with no concept_name.x)
56+
57+
names(d2)
58+
59+
#overview of sites
60+
soverview<-d2 %>% count(site)
61+
soverview
62+
#soverview %>% write_csv('extras/DqdResults/S1_overview.csv')
63+
64+
#remove columns that are not needed
65+
# d3<-d2 %>% select(-stratum_3,-stratum_4,-stratum_5,-p25_value,-p75_value) %>%
66+
# filter(count_value >=100 ) %>% arrange(stratum_1,desc(count_value) )
67+
68+
d3<-d2 %>% select(-stratum_3,-stratum_4,-stratum_5) %>%
69+
arrange(stratum_1,desc(count_value) )
70+
71+
72+
#d3 %>% count(site)
73+
#names(d3)
74+
ba<-d2 %>% group_by(stratum1_id,stratum2_id,concept_name.x,concept_name.y) %>% summarize(tcnt=sum(count_value),n=n())
75+
ba %>% filter(n>=2) %>% nrow()
76+
nrow(ba)
77+
#4465 distinct test-unit pairs
78+
#872 test-unit pairs have 2+ sites
79+
80+
81+
82+
#tests with more units
83+
ba %>% ungroup() %>% count(stratum1_id,concept_name.x) %>% filter(n>=2)
84+
#TODO improve later
85+
#868 tests have 2+ units
86+
87+
#only where multiple sites
88+
#d10<-d3 %>% inner_join(ba %>% filter(n>=2))
89+
90+
91+
92+
93+
94+
#even more removal of data
95+
#d4<-d3 %>% select(-count_value,-median_value,-stdev_value,-avg_value,-site)
96+
97+
#d4 %>% write_csv('extras/DqdResults/thresholds-list-A.csv')
98+
#nrow(d4)
99+
100+
101+
#end of analysis of
102+
103+
104+
#---------------comparison with expert driven
105+
106+
#read expert driven checks
107+
library(stats);library(tidyverse);library(magrittr)
108+
#message("\n*** Successfully loaded .Rprofile ***\n")
109+
110+
111+
#location of the expert-driven concept-level checks (CSV) read on the next step
url <- "https://raw.githubusercontent.com/OHDSI/DataQualityDashboard/master/inst/csv/OMOP_CDMv5.3.1_Concept_Level.csv"
112+
dqd<-read_csv(url)
113+
str(dqd)
114+
names(dqd)
115+
#preview rows whose table is MEASUREMENT and whose field is MEASUREMENT_CONCEPT_ID
#(column is cdmFieldName; the previous spelling cmdFieldName does not exist in dqd)
dqd %>% dplyr::filter(cdmTableName=='MEASUREMENT' & cdmFieldName=='MEASUREMENT_CONCEPT_ID' )
116+
dqd %>% dplyr::filter(cdmFieldName=='MEASUREMENT_CONCEPT_ID' ) %>% nrow()
117+
dqd %>% count(cdmTableName,cdmFieldName)
118+
119+
120+
#compare data driven and expert driven sets
121+
#d$STRATUM_1 %<>% as.integer()
122+
dqd$unitConceptId %<>% as.integer()
123+
expert <-dqd %>% dplyr::filter(cdmFieldName=='MEASUREMENT_CONCEPT_ID' )
124+
nrow(expert)
125+
#856 threshold checks are in expert driven KB
126+
names(expert)
127+
elabs<-expert %>% group_by(conceptId,conceptName) %>% summarise(unitcnt=n(),units=paste(unitConceptName,collapse = "|"))
128+
129+
# for 330 distinct lab tests
130+
elabs %>% write_csv('extras/DqdResults/DQD-expert-driven-A-lab-list.csv')
131+
132+
names(expert)
133+
#ddriven<-d %>% rename(conceptId=STRATUM_1,unitConceptId=STRATUM_2) %>% select(conceptId,unitConceptId) %>% unique()
134+
135+
names(d2)
136+
#ddriven<-d %>% rename(conceptId=STRATUM_1,unitConceptId=STRATUM_2)
137+
ddriven<-d2 %>% rename(conceptId=stratum1_id,unitConceptId=stratum2_id)
138+
ddriven2<-ba %>% rename(conceptId=stratum1_id,unitConceptId=stratum2_id)
139+
140+
#ddriven %<>% filter(conceptId!=0)
141+
#ddriven %<>% filter(unitConceptId!=0)
142+
143+
over=expert %>% inner_join(ddriven2)
144+
nrow(over)
145+
#331 tests are overlapping between ddriven (data driven) and expert (expert driven)
146+
#View(over)
147+
148+
not2<-expert %>% anti_join(ddriven2)
149+
nrow(not2)
150+
#525 are in expert list but not in data from any site
151+
152+
not1<-ddriven2 %>% anti_join(expert)
153+
nrow(not1)
154+
#4134 are in data but are absent in expert driven KB
155+
156+
157+
#compare the thresholds
158+
names(over)
159+
over %>% select(conceptName,unitConceptName,plausibleValueLow,min_value)
160+
over %>% select(conceptName,unitConceptName,plausibleValueHigh,max_value)
161+
#%>% knitr::kable()
162+
163+
164+
#expert thresholds don't follow unit conversion logic (max and min are the same even if units indicate an order of magnitude difference)
165+
#MEASUREMENT MEASUREMENT_CONCEPT_ID 3013721 Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma 8713 gram per deciliter 5 5 2000 5 NA NA NA NA NA NA NA NA
166+
#MEASUREMENT MEASUREMENT_CONCEPT_ID 3013721 Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma 8840 milligram per deciliter 5 5 2000
167+
168+
#5g/dL into mg/dL (is 5000 mg/dL)
169+
#in data is in fact unit/L
170+
171+
172+
173+
#unitmorph
174+
#Protein [Mass/volume] in Serum or Plasma 7096851 4 gram per deciliter|unit|milligram per deciliter|gram per liter
175+
# gram per deciliter| |milligram per deciliter | gram per liter
176+
names(d3)
177+
bb<-d3 %>% filter(site=='ThresholdsA.csv') %>% group_by(stratum_1,concept_name.x) %>%
178+
summarize(tcnt=sum(count_value)
179+
,n=n(),units=paste(concept_name.y,collapse = '|')
180+
,cnts=paste(count_value ,collapse = '|')
181+
,unitcids=paste(stratum_2,collapse = '|')
182+
)
183+
bb %>% write_csv('local/morphA.csv')

inst/dqd/readme.txt

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,23 @@ This file describes individual files included in the DataQuality export folder/e
22

33

44

5+
ThresholdsB.csv
6+
===============
7+
8+
This file uses custom SQL code (not Achilles, but inspired by it) and additionally includes percentiles 1, 2, 3, 97, 98, and 99.
9+
10+
11+
12+
513
SuitableMeasurementsAndUnits.csv
614
===================================
715

816
This file has the suitable pairs of measurements and units.
9-
17+
(not used in latest version)
1018

1119

1220
ThresholdsA.csv
1321
===============
1422

1523
This file uses Achilles measure 1815 and has percentiles 10 and 90.
16-
17-
18-
ThresholdsB.csv
19-
===============
20-
21-
This file is using custom SQL code (not achilles, but inspired by it) and has in addition percentiles 1,2,3,97,98,99
24+
(not used in latest version)

0 commit comments

Comments
 (0)