-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path1_DataCleaning.R
More file actions
63 lines (52 loc) · 2.35 KB
/
1_DataCleaning.R
File metadata and controls
63 lines (52 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# Load necessary packages
#install.packages("dplyr")
library(dplyr) # Version 1.1.4
# Read the two VIP tables (mitochondrial and Y-chromosome).
# The second column of each file is treated as the haplogroup column.
vips_mtDNA <- read.csv("~/VIP_Haplogroups/Raw_Data/vipsMtDNA.csv")
names(vips_mtDNA)[2] <- "mtDNA.haplogroup"

vips_yDNA <- read.csv("~/VIP_Haplogroups/Raw_Data/vipsYDNA.csv")
# Columns 4 and 5 of the Y-DNA file are discarded (dropped by position)
vips_yDNA <- vips_yDNA[, -c(4, 5)]
names(vips_yDNA)[2] <- "Y.haplogroup"
# Load AADR dataset and keep only a subset of its annotation columns.
# Columns are dropped by position, so this list is tied to the column layout
# of the AADR_Annotation.csv file being read.
AADR <- read.csv("~/VIP_Haplogroups/Raw_Data/AADR_Annotation.csv")
AADR <- AADR[, -c(1, 4:8, 10:12, 18:26, 28, 30:36)]
# Fix column names (positions refer to the trimmed table)
colnames(AADR)[3] <- "Date"
colnames(AADR)[9] <- "Y.haplogroup"
colnames(AADR)[10] <- "mtDNA.haplogroup"
# Fix date column to reflect true year (if it's negative it's BCE, if it's
# positive it's CE).
# NOTE(review): assumes the raw column is in years before present, counted
# back from 1950 -- confirm against the AADR annotation documentation.
AADR$Date <- 1950 - AADR$Date
# Split into 2 AADR tables: one with mtDNA and one with yDNA.
# No explicit copy of AADR is needed: R data frames are copy-on-modify, so
# each pipeline below yields a new data frame and AADR itself is untouched.
AADR_mtDNA <- AADR %>%
  select(-Y.haplogroup) %>%
  # Keep only entries with mtDNA: the haplogroup must be a non-empty, purely
  # alphanumeric string (grepl() is FALSE for NA, so missing values go too)
  filter(grepl("^[A-Za-z0-9]+$", mtDNA.haplogroup))
colnames(AADR_mtDNA)[6] <- "Origin"

AADR_yDNA <- AADR %>%
  select(-mtDNA.haplogroup) %>%
  # Keep only entries with Y, using the same alphanumeric rule
  filter(grepl("^[A-Za-z0-9]+$", Y.haplogroup))
colnames(AADR_yDNA)[6] <- "Origin"
# Give every table the same haplogroup column name ("haplogroup") so the
# datasets can be joined on it later.
# VIP tables: the haplogroup is column 2.
names(vips_mtDNA)[2] <- "haplogroup"
names(vips_yDNA)[2] <- "haplogroup"
# AADR tables: the haplogroup is column 9 in both splits.
names(AADR_mtDNA)[9] <- "haplogroup"
names(AADR_yDNA)[9] <- "haplogroup"
# Read the parent-leaf pair tables for the mtDNA and Y trees.
# In both, column 1 is named "haplogroup" for now (the left_join on the
# Y tree needs that name) and column 2 is named "Parent".
mtPhylo <- read.csv(
  "~/VIP_Haplogroups/Raw_Data/mt_phyloTree.csv",
  quote = ""
)
names(mtPhylo)[1:2] <- c("haplogroup", "Parent")

yPhylo <- read.csv("~/VIP_Haplogroups/Raw_Data/y_phyloTree.csv")
names(yPhylo)[1:2] <- c("haplogroup", "Parent")
# Remove vips whose haplogroup was not found in the y tree.
# After the left join, VIP rows with no match in yPhylo have a missing
# Parent, so test for that explicitly with is.na() rather than comparing
# against the string "NA".
merged_y_tree <- left_join(vips_yDNA, yPhylo, by = "haplogroup")
vips_yDNA <- merged_y_tree %>%
  filter(!is.na(Parent)) %>%
  select(-Parent)
# The first column of each phylo table is renamed to "Child".
# The data.tree package is used on these tables later, and this is the name
# it needs in order to recognize these values as branches when building a tree.
names(mtPhylo)[1] <- "Child"
names(yPhylo)[1] <- "Child"