## Name: collect_scjohn_fuse_data.R
## Author: Katherine A. Phillips
## Date Created: April 2015
## Purpose: Collect and parse HTML data for SC Johnson's Ingredient/Use database.
library(XML)
##----------------------------------------------------------------------------##
## This finds all the links on the SC Johnson webpage; it isn't strictly    ##
## needed now that the hrefs are saved to a file.                           ##
##----------------------------------------------------------------------------##
## Base URL
scjurl <- "http://www.whatsinsidescjohnson.com/en-us/ingredients.aspx"
## Parse the HTML from the URL into a searchable document tree
doc <- htmlParse(scjurl)
## Get all of the links out of the parsed HTML
links <- xpathSApply(doc,"//a/@href")
## Free the parsed document to release its memory
free(doc)
## Convert the links to a plain character vector
links <- as.vector(links)
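##----------------------------------------------------------------------------##
## A minimal sketch of how the hrefs file read below could be regenerated    ##
## from these links. The "/en-us/ingredients/" pattern is an assumption      ##
## about the site layout, not something confirmed from the saved file.       ##
##----------------------------------------------------------------------------##
## Keep only links that look like ingredient pages (pattern is an assumption)
ingredient_hrefs <- grep("/en-us/ingredients/", links, value=TRUE)
## Uncomment to (re)write the hrefs file consumed by the loops below:
## writeLines(ingredient_hrefs, "scjohnson_indredients_hrefs.txt")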
##----------------------------------------------------------------------------##
## This loop downloads all the URLs into the working directory. Only run it ##
## if you do not already have the HTML files saved -- this loop takes       ##
## FOREVER!                                                                 ##
##----------------------------------------------------------------------------##
## Root URL for downloading HTML files
scjurl <- "http://www.whatsinsidescjohnson.com"
## Read all child-page hrefs into a character vector
urls <- readLines("scjohnson_indredients_hrefs.txt",warn=FALSE)
## Loop over all URLs -- I know, you're not supposed to loop in R, but it's so easy!
for (i in seq_along(urls)){
## Make full URL
ChemURL <- paste(scjurl,urls[i],sep='')
## Create an HTML file name, e.g. "some-chem.aspx" becomes "some_chem.html"
ChemFileName <- gsub('-','_',gsub("aspx","html",basename(ChemURL)))
## Download the file (the numeric status code it returns isn't needed)
download.file(url=ChemURL,destfile=ChemFileName)
## Don't overload the server
Sys.sleep(10)
}
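##----------------------------------------------------------------------------##
## A more defensive variant of the download step (a sketch; safe_download is ##
## a hypothetical helper, not part of the original workflow). tryCatch keeps ##
## one bad URL from aborting the whole run, and skipping files that already  ##
## exist makes reruns cheap.                                                 ##
##----------------------------------------------------------------------------##
safe_download <- function(url, destfile){
  ## Skip files fetched on a previous run
  if (file.exists(destfile)) return(invisible(TRUE))
  tryCatch({
    download.file(url=url, destfile=destfile)
    TRUE
  }, error=function(e){
    warning("Failed to download ", url, ": ", conditionMessage(e))
    FALSE
  })
}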
##----------------------------------------------------------------------------##
## This loop parses the downloaded HTML files into a data frame containing  ##
## each chemical's name, use description, and source URL                    ##
##----------------------------------------------------------------------------##
## Root URL for reconstructing each chemical's full page URL
scjurl <- "http://www.whatsinsidescjohnson.com"
## Read all child-page hrefs into a character vector
urls <- readLines("scjohnson_indredients_hrefs.txt",warn=FALSE)
## Initialize empty data frame
SCJChem <- data.frame()
## Loop over all the URLs
for (i in seq_along(urls)){
## Create the HTML file name
ChemFileName <- gsub('-','_',gsub("aspx","html",basename(urls[i])))
## Parse chemical HTML file
ChemDoc <- xmlParse(ChemFileName,isHTML=TRUE)
## Extract the chemical name (the third <h3> in the 2015 page layout)
ChemName <- xmlValue(xpathSApply(ChemDoc,"//h3")[[3]])
## Extract the chemical's use description from the HTML file
ChemUse <- xmlValue(xpathSApply(ChemDoc,"//p")[[2]])
## Strip commas so the unquoted CSV written below stays well-formed
ChemUse <- gsub(","," ",ChemUse)
## Make full URL
ChemURL <- paste(scjurl,urls[i],sep='')
## Store name, use description, and source URL in the data frame
SCJChem <- rbind(SCJChem,
                 data.frame(ChemName,ChemUse,ChemURL,stringsAsFactors=FALSE))
}
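##----------------------------------------------------------------------------##
## The [[3]] and [[2]] indices above are tied to the 2015 page layout. A     ##
## hypothetical helper like the one below (illustrative only) would fail     ##
## loudly instead of grabbing the wrong node if the layout ever changed:     ##
##----------------------------------------------------------------------------##
nth_node_value <- function(doc, xpath, n){
  nodes <- xpathSApply(doc, xpath)
  if (length(nodes) < n) stop("Expected at least ", n, " matches for ", xpath)
  xmlValue(nodes[[n]])
}
## e.g. ChemName <- nth_node_value(ChemDoc, "//h3", 3)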
## Rename data frame columns
colnames(SCJChem) <- c("ChemicalName","UseCategory","File")
## Save information to csv file
write.csv(SCJChem,"SCJohnsonChemicalIngredients_RAW.csv",quote=FALSE,row.names=FALSE)
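##----------------------------------------------------------------------------##
## Optional sanity check: read the CSV back and confirm one row per href.    ##
##----------------------------------------------------------------------------##
chk <- read.csv("SCJohnsonChemicalIngredients_RAW.csv", stringsAsFactors=FALSE)
stopifnot(nrow(chk) == length(urls))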