|
| 1 | +# Compare the master version of MEASUREMENTS with beckybanbury's version just after she ran ID_sets_of_duplicate_records.R. This code makes sure that overwriting D.precedence (for now, and other duplicate-related columns later) happened only where it was meant to happen. |
| 2 | + |
| 3 | +library(RCurl) |
| 4 | + |
| 5 | +meas.becky <- read.csv(text = getURL("https://raw.githubusercontent.com/forc-db/ForC/beckybanbury/data/ForC_measurements.csv"), stringsAsFactors = FALSE) # measurements table from the beckybanbury branch; FALSE is spelled out because F is an ordinary variable that can be reassigned |
| 6 | +meas.master <- read.csv(text = getURL("https://raw.githubusercontent.com/forc-db/ForC/master/data/ForC_measurements.csv"), stringsAsFactors = FALSE) # measurements table from the master branch, used as the reference to compare against |
| 7 | + |
| 8 | + |
| 9 | +duplicate.related.columns <- c("measurement.ID","conflicts", "R.group", "S.group", "D.group", |
| 10 | + "D.precedence", "conflict.type", "D.precedence.measurement.ID", |
| 11 | + "conflicts.notes", "checked.ori.pub", "loaded.by", "flag.suspicious") |
| 12 | + |
| 13 | +stopifnot(nrow(meas.becky) == nrow(meas.master), all(meas.becky$measurement.ID == meas.master$measurement.ID)) # has to be TRUE to move forward: halt here if the two tables are not row-aligned, because every comparison below matches rows by position |
| 14 | + |
| 15 | +original.duplicate.related.column.values <- meas.master[, duplicate.related.columns] |
| 16 | +new.duplicate.related.column.values <- meas.becky[, duplicate.related.columns] |
| 17 | + |
| 18 | + |
| 19 | +not.the.same.ones <- which(apply(original.duplicate.related.column.values == new.duplicate.related.column.values, 1, function(x) any(!x[!is.na(x)]))) |
| 20 | + |
| 21 | +what.not.the.same.ones <- apply(original.duplicate.related.column.values == new.duplicate.related.column.values, 1, function(x) names(x)[!x & !is.na(x)]) |
| 22 | +table(unlist(what.not.the.same.ones[not.the.same.ones])) |
| 23 | + |
| 24 | +what.not.the.same.ones.D.precedence <- apply(original.duplicate.related.column.values == new.duplicate.related.column.values, 1, function(x) "D.precedence" %in% names(x)[!x & !is.na(x)]) |
| 25 | + |
| 26 | + |
| 27 | +table(original.duplicate.related.column.values[what.not.the.same.ones.D.precedence,]$loaded.by) # different D.precedence with newer entries is normal. |
| 28 | + |
| 29 | +# Let's look at D.groups that involve older data: #### |
| 30 | + |
| 31 | +original.duplicate.related.column.values[what.not.the.same.ones.D.precedence & original.duplicate.related.column.values$loaded.by %in% c("Kristina J. Anderson-Teixeira" , "Maria M. H. Wang" ),]$D.group |
| 32 | +new.duplicate.related.column.values[what.not.the.same.ones.D.precedence & original.duplicate.related.column.values$loaded.by %in% c("Kristina J. Anderson-Teixeira" , "Maria M. H. Wang" ),]$D.group |
| 33 | + |
| 34 | +## D.group 432 and others related to it #### |
| 35 | +X.group = 432 |
| 36 | +pattern.X.group <- paste0("(^",X.group, "$)|(^",X.group, ";)|(;", X.group, ";)|(;", X.group, "$)") |
| 37 | +original.duplicate.related.column.values[grepl(pattern.X.group, original.duplicate.related.column.values$D.group), ] |
| 38 | +new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ] |
| 39 | + |
| 40 | +X.group = 1411 |
| 41 | +pattern.X.group <- paste0("(^",X.group, "$)|(^",X.group, ";)|(;", X.group, ";)|(;", X.group, "$)") |
| 42 | +new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ] |
| 43 | +View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ]) |
| 44 | + |
| 45 | +X.group1 = 432; X.group2= 1411; X.group3 = 1421; X.group4 = 1422; X.group5 = 1423; X.group6 = 1424; X.group7 = 1426; X.group8 = 1427; X.group9 = 1428; X.group10 = 14219 |
| 46 | +all.X.group <- unlist(mget(paste0("X.group", 1:10))) |
| 47 | +pattern.X.group <- paste0(paste0("(^",all.X.group, "$)|(^",all.X.group, ";)|(;", all.X.group, ";)|(;", all.X.group, "$)"), collapse = "|") |
| 48 | +new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ] |
| 49 | +View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ]) |
| 50 | + |
| 51 | +# D.group 433 and others related to it #### |
| 52 | +X.group = 433 |
| 53 | +pattern.X.group <- paste0("(^",X.group, "$)|(^",X.group, ";)|(;", X.group, ";)|(;", X.group, "$)") |
| 54 | +original.duplicate.related.column.values[grepl(pattern.X.group, original.duplicate.related.column.values$D.group), ] |
| 55 | +new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ] |
| 56 | + |
| 57 | +new.duplicate.related.column.values[new.duplicate.related.column.values$measurement.ID %in% original.duplicate.related.column.values[grepl(pattern.X.group, original.duplicate.related.column.values$D.group), ]$measurement.ID,] # group 433 has been replaced by group 1411 because of new data from Becky Banbury Morgan (Beckybanbury) |
| 58 | + |
| 59 | +X.group = 1411 |
| 60 | +pattern.X.group <- paste0("(^",X.group, "$)|(^",X.group, ";)|(;", X.group, ";)|(;", X.group, "$)") |
| 61 | +View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ]) |
| 62 | + |
| 63 | + |
| 64 | + |
| 65 | +# D.group 489 and others related to it #### |
| 66 | +X.group = 489 |
| 67 | +pattern.X.group <- paste0("(^",X.group, "$)|(^",X.group, ";)|(;", X.group, ";)|(;", X.group, "$)") |
| 68 | +original.duplicate.related.column.values[grepl(pattern.X.group, original.duplicate.related.column.values$D.group), ] |
| 69 | +new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ]# group 490 has been replaced by group 1616 because of new data from Becky Banbury Morgan (Beckybanbury). NOTE(review): this section filters on D.group 489, not 490 - confirm which group number is meant |
| 70 | + |
| 71 | +X.group = 1616 |
| 72 | +pattern.X.group <- paste0("(^",X.group, "$)|(^",X.group, ";)|(;", X.group, ";)|(;", X.group, "$)") |
| 73 | +View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ]) |
| 74 | + |
| 75 | + |
| 76 | + |
| 77 | +X.group1 = 489; X.group2= 1616; X.group3 = 1625; X.group4 = 1629 |
| 78 | +all.X.group <- unlist(mget(paste0("X.group", 1:4))) |
| 79 | +pattern.X.group <- paste0(paste0("(^",all.X.group, "$)|(^",all.X.group, ";)|(;", all.X.group, ";)|(;", all.X.group, "$)"), collapse = "|") |
| 80 | +new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ] |
| 81 | +View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ]) |
| 82 | + |
| 83 | +# D.group 475 and others related to it #### |
| 84 | +X.group = 475 |
| 85 | +pattern.X.group <- paste0("(^",X.group, "$)|(^",X.group, ";)|(;", X.group, ";)|(;", X.group, "$)") |
| 86 | +original.duplicate.related.column.values[grepl(pattern.X.group, original.duplicate.related.column.values$D.group), ] |
| 87 | +new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ]# group 476 has been replaced by group 1577 because of new data from Becky Banbury Morgan (Beckybanbury). NOTE(review): this section filters on D.group 475, not 476 - confirm which group number is meant |
| 88 | + |
| 89 | +X.group = 1577 |
| 90 | +pattern.X.group <- paste0("(^",X.group, "$)|(^",X.group, ";)|(;", X.group, ";)|(;", X.group, "$)") |
| 91 | +View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ]) |
| 92 | + |
| 93 | + |
| 94 | + |
| 95 | +X.group1 = 475; X.group2= 1577; X.group3 = 1585; X.group4 = 1586; X.group5 = 1587; X.group6 = 1588; X.group7 = 1589; X.group8 = 1590; X.group9 = 1591; X.group10 = 1592; X.group11 = 1593 |
| 96 | +all.X.group <- unlist(mget(paste0("X.group", 1:11))) |
| 97 | +pattern.X.group <- paste0(paste0("(^",all.X.group, "$)|(^",all.X.group, ";)|(;", all.X.group, ";)|(;", all.X.group, "$)"), collapse = "|") |
| 98 | +new.duplicate.related.column.values[grepl(pattern.X.group, new.duplicate.related.column.values$D.group), ] |
| 99 | +View(meas.becky[grepl(pattern.X.group, meas.becky$D.group), ]) |
| 100 | + |
| 101 | + |
| 102 | +# D.group NAC and others related to it #### |
| 103 | +View(original.duplicate.related.column.values[what.not.the.same.ones.D.precedence & original.duplicate.related.column.values$loaded.by %in% c("Kristina J. Anderson-Teixeira" , "Maria M. H. Wang")& original.duplicate.related.column.values$D.group %in% "NAC",]) |
| 104 | +View(meas.becky[what.not.the.same.ones.D.precedence & original.duplicate.related.column.values$loaded.by %in% c("Kristina J. Anderson-Teixeira" , "Maria M. H. Wang" ) & original.duplicate.related.column.values$D.group %in% "NAC",]) |
| 105 | + |
| 106 | + |
| 107 | +save(MEASUREMENTS.split, file = "scripts/z_archive/MEASUREMENTS.split.Rdata") |
| 108 | +load("scripts/z_archive/MEASUREMENTS.split.Rdata") |
0 commit comments