Skip to content

Commit d04dfa8

Browse files
committed
upload
1 parent 245719a commit d04dfa8

File tree

1 file changed

+108
-0
lines changed

1 file changed

+108
-0
lines changed
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# Compare the master version of MEASUREMENTS with beckybanbury's version just after she ran ID_sets_of_duplicate_records.R. This code makes sure that overwriting D.precedence (for now; other duplicate-related columns later) happened only where it was meant to happen.
2+
3+
library(RCurl)
4+
5+
# Download ForC_measurements.csv from the `beckybanbury` and `master` branches
# (fetched as text with getURL) and parse each copy into a data frame.
# Strings are kept as character vectors. FALSE is spelled out because the
# shorthand `F` is an ordinary variable that can be reassigned.
meas.becky <- read.csv(
  text = getURL("https://raw.githubusercontent.com/forc-db/ForC/beckybanbury/data/ForC_measurements.csv"),
  stringsAsFactors = FALSE
)
meas.master <- read.csv(
  text = getURL("https://raw.githubusercontent.com/forc-db/ForC/master/data/ForC_measurements.csv"),
  stringsAsFactors = FALSE
)
7+
8+
9+
# Columns pulled out of both versions for comparison: the record identifier
# plus the columns this script treats as duplicate-related.
duplicate.related.columns <- c(
  "measurement.ID",
  "conflicts",
  "R.group",
  "S.group",
  "D.group",
  "D.precedence",
  "conflict.type",
  "D.precedence.measurement.ID",
  "conflicts.notes",
  "checked.ori.pub",
  "loaded.by",
  "flag.suspicious"
)
12+
13+
# Both tables must hold the same measurement.ID values in the same order,
# otherwise the row-by-row comparisons that follow are meaningless. Stop here
# on a mismatch instead of only printing TRUE/FALSE. The row-count check comes
# first because `==` would silently recycle the shorter vector.
stopifnot(
  nrow(meas.becky) == nrow(meas.master),
  all(meas.becky$measurement.ID == meas.master$measurement.ID)
)
14+
15+
# Keep only the duplicate-related columns of each version.
original.duplicate.related.column.values <- meas.master[, duplicate.related.columns]
new.duplicate.related.column.values <- meas.becky[, duplicate.related.columns]

# Cell-by-cell comparison, computed once and reused below. A cell counts as
# "different" only when the comparison is non-missing and FALSE.
# NOTE(review): a cell that is NA in exactly one version compares as NA and is
# therefore ignored, as before -- confirm that NA <-> value changes are meant
# to be skipped.
cells.equal <- original.duplicate.related.column.values == new.duplicate.related.column.values
cells.differ <- !is.na(cells.equal) & !cells.equal

# Indices of the rows with at least one differing cell.
not.the.same.ones <- which(rowSums(cells.differ) > 0)

# Names of the differing columns for every row. lapply() always returns a
# list; apply() would return a matrix or vector instead whenever every row
# has the same number of differing columns, which breaks the indexing below.
what.not.the.same.ones <- lapply(
  seq_len(nrow(cells.differ)),
  function(i) colnames(cells.differ)[cells.differ[i, ]]
)
names(what.not.the.same.ones) <- rownames(cells.differ)
table(unlist(what.not.the.same.ones[not.the.same.ones]))

# TRUE for the rows whose D.precedence differs between the two versions.
what.not.the.same.ones.D.precedence <- cells.differ[, "D.precedence"]

table(original.duplicate.related.column.values$loaded.by[what.not.the.same.ones.D.precedence]) # different D.precedence with newer entries is normal.
28+
29+
# Let's look at D.groups that involve older data: ####

# Rows whose D.precedence differs and that were loaded by one of the two
# people below; print their D.group in the master version, then in
# beckybanbury's version.
older.loaders <- c("Kristina J. Anderson-Teixeira", "Maria M. H. Wang")
older.rows <- what.not.the.same.ones.D.precedence &
  original.duplicate.related.column.values$loaded.by %in% older.loaders
original.duplicate.related.column.values$D.group[older.rows]
new.duplicate.related.column.values$D.group[older.rows]
33+
34+
# Build a regular expression matching a D.group string that contains any of
# the given group IDs as a complete ";"-separated entry (the whole string, the
# first, a middle, or the last entry). For one ID the result is
# "(^ID$)|(^ID;)|(;ID;)|(;ID$)"; several IDs are joined with "|".
d.group.pattern <- function(groups) {
  paste0(
    paste0("(^", groups, "$)|(^", groups, ";)|(;", groups, ";)|(;", groups, "$)"),
    collapse = "|"
  )
}

## D.group 432 and others related to it ####
X.group <- 432
pattern.X.group <- d.group.pattern(X.group)
original.duplicate.related.column.values[grepl(pattern.X.group, original.duplicate.related.column.values$D.group), ]
new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ]

X.group <- 1411
pattern.X.group <- d.group.pattern(X.group)
new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ]
View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ])

# All group IDs inspected together for this section.
# NOTE(review): 14219 does not fit the run of 142x IDs next to it -- confirm
# it is not a typo.
all.X.group <- c(432, 1411, 1421, 1422, 1423, 1424, 1426, 1427, 1428, 14219)
pattern.X.group <- d.group.pattern(all.X.group)
new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ]
View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ])

# D.group 433 and others related to it ####
X.group <- 433
pattern.X.group <- d.group.pattern(X.group)
original.duplicate.related.column.values[grepl(pattern.X.group, original.duplicate.related.column.values$D.group), ]
new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ]

# Same records (matched on measurement.ID) as they appear in the new version.
ids.in.X.group <- original.duplicate.related.column.values$measurement.ID[
  grepl(pattern.X.group, original.duplicate.related.column.values$D.group)
]
new.duplicate.related.column.values[new.duplicate.related.column.values$measurement.ID %in% ids.in.X.group, ] # group 433 has been replaced by group 1411 because of new data from Becky Banbury Morgan (Beckybanbury)

X.group <- 1411
pattern.X.group <- d.group.pattern(X.group)
View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ])

# D.group 489 and others related to it ####
X.group <- 489
pattern.X.group <- d.group.pattern(X.group)
original.duplicate.related.column.values[grepl(pattern.X.group, original.duplicate.related.column.values$D.group), ]
# group 490 has been replaced by group 1616 because of new data from Becky Banbury Morgan (Beckybanbury)
# NOTE(review): the comment above names group 490 while this section filters
# on 489 -- confirm which one is intended.
new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ]

X.group <- 1616
pattern.X.group <- d.group.pattern(X.group)
View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ])

all.X.group <- c(489, 1616, 1625, 1629)
pattern.X.group <- d.group.pattern(all.X.group)
new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ]
View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ])

# D.group 475 and others related to it ####
X.group <- 475
pattern.X.group <- d.group.pattern(X.group)
original.duplicate.related.column.values[grepl(pattern.X.group, original.duplicate.related.column.values$D.group), ]
# group 476 has been replaced by group 1577 because of new data from Becky Banbury Morgan (Beckybanbury)
# NOTE(review): the comment above names group 476 while this section filters
# on 475 -- confirm which one is intended.
new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ]

X.group <- 1577
pattern.X.group <- d.group.pattern(X.group)
View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ])

all.X.group <- c(475, 1577, 1585, 1586, 1587, 1588, 1589, 1590, 1591, 1592, 1593)
pattern.X.group <- d.group.pattern(all.X.group)
new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ]
View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ])
100+
101+
102+
# D.group NAC and others related to it ####

# Rows whose D.precedence differs, that were loaded by one of the two people
# below, and whose D.group in the master version is "NAC". The mask is built
# once so both views are guaranteed to show the same rows.
nac.rows <- what.not.the.same.ones.D.precedence &
  original.duplicate.related.column.values$loaded.by %in% c("Kristina J. Anderson-Teixeira", "Maria M. H. Wang") &
  original.duplicate.related.column.values$D.group %in% "NAC"

# Master version of those rows (duplicate-related columns only), then the
# full beckybanbury rows.
View(original.duplicate.related.column.values[nac.rows, ])
View(meas.becky[nac.rows, ])
105+
106+
107+
# Write MEASUREMENTS.split to the archive file, then read it straight back.
# NOTE(review): MEASUREMENTS.split is not created anywhere in the code visible
# here, so save() fails unless the object already exists in the session --
# confirm where it is supposed to come from.
archive.file <- "scripts/z_archive/MEASUREMENTS.split.Rdata"
save(MEASUREMENTS.split, file = archive.file)
load(archive.file)

0 commit comments

Comments
 (0)