Commit 522cd65

Merge branch 'master' of github.com:USCDataScience/sparkler

2 parents: 2369618 + cc0a8e5

27 files changed: +348 −236 lines

README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -23,7 +23,7 @@ To use sparkler, install [docker](https://www.docker.com/community-edition#/down
 
 ```bash
 # Step 0. Get this script
-wget https://raw.githubusercontent.com/USCDataScience/sparkler/master/bin/dockler.sh
+wget https://raw.githubusercontent.com/USCDataScience/sparkler/master/sparkler-core/bin/dockler.sh
 # Step 1. Run the script - it starts docker container and forwards ports to host
 bash dockler.sh
 # Step 2. Inject seed urls
````

sparkler-core/conf/sparkler-default.yaml

Lines changed: 12 additions & 8 deletions
```diff
@@ -17,16 +17,20 @@
 
 ##################### General Properties ################################
 
-# Crawl Database URL. Stores crawl metadata and status updates.
-# Type: String. Default: http://localhost:8983/solr/crawldb
-# for standalone server
-crawldb.uri: http://localhost:8983/solr/crawldb
+# uri - Crawl Database URL. Stores crawl metadata and status updates.
 
-# for quick test crawls using embedded solr
-#crawldb.uri: file://conf/solr/crawldb
+crawldb.backend: solr # "solr" is default until "elasticsearch" becomes usable.
 
-# for cloudmode with zookeepers; Format = collectionName::zkhost1:port1,zkhost2:port2,zkhost3:port3
-# crawldb.uri: crawldb::localhost:9983
+# Type: String. Default: http://localhost:8983/solr/crawldb
+# for standalone server
+# For quick test crawls using embedded solr
+# solr.uri: file://conf/solr/crawldb
+# For cloudmode with zookeepers; Format = collectionName::zkhost1:port1,zkhost2:port2,zkhost3:port3
+# solr.uri: crawldb::localhost:9983
+solr.uri: http://localhost:8983/solr/crawldb
+
+# elasticsearch settings
+elasticsearch.uri: http://localhost:9200
 
 
 ##################### Apache Spark Properties ###########################
```
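The config now names a backend (`crawldb.backend`) and resolves the matching `<backend>.uri` key. A minimal sketch of that lookup, assuming only plain Map semantics; the `BackendUriResolver` class here is hypothetical, for illustration only (the real resolution is the `getDatabaseURI()` method added to `SparklerConfiguration` below):

```java
import java.util.HashMap;
import java.util.Map;

// Hypothetical illustration class; mirrors the lookup this commit adds.
public class BackendUriResolver {
    public static void main(String[] args) {
        Map<String, Object> conf = new HashMap<>();
        conf.put("crawldb.backend", "solr"); // default until elasticsearch becomes usable
        conf.put("solr.uri", "http://localhost:8983/solr/crawldb");
        conf.put("elasticsearch.uri", "http://localhost:9200");

        // Fall back to "solr" when no backend is configured, then read "<backend>.uri"
        String backend = (String) conf.getOrDefault("crawldb.backend", "solr");
        String uri = (String) conf.get(backend + ".uri");
        System.out.println(uri); // -> http://localhost:8983/solr/crawldb
    }
}
```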

sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/Constants.java

Lines changed: 2 additions & 2 deletions
```diff
@@ -38,7 +38,7 @@ interface key { //config key name
 
         // General Properties
         @ConfigKey
-        String CRAWLDB = "crawldb.uri";
+        String CRAWLDB_BACKEND = "crawldb.backend";
 
         // Apache Spark Properties
         @ConfigKey
@@ -122,7 +122,7 @@ interface file {
         String SPARKLER_DEFAULT = "sparkler-default.yaml";
     }
 
-    interface solr { // Solr Fields
+    interface storage { // Storage Fields
         String ID = "id";
         String CRAWL_ID = "crawl_id";
         String URL = "url";
```
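Call sites keep reading the same string values through the renamed members. A self-contained sketch, trimmed to the members visible in this diff (the real interfaces carry many more fields):

```java
// Stand-alone, trimmed copy of the renamed interfaces, for illustration only.
public class ConstantsRenameSketch {
    interface key {
        String CRAWLDB_BACKEND = "crawldb.backend"; // was: CRAWLDB = "crawldb.uri"
    }
    interface storage { // was: interface solr
        String ID = "id";
        String CRAWL_ID = "crawl_id";
        String URL = "url";
    }

    public static void main(String[] args) {
        // e.g. a filter query built from the storage fields, as the RDDs below do
        System.out.println(storage.CRAWL_ID + ":someJobId");
        System.out.println("backend picked via " + key.CRAWLDB_BACKEND);
    }
}
```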

sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/SparklerConfiguration.java

Lines changed: 5 additions & 0 deletions
```diff
@@ -102,4 +102,9 @@ private static Map deepMerge(Map original, Map newMap) {
         return original;
     }
 
+    public String getDatabaseURI() {
+        String dbToUse = (String) this.getOrDefault(Constants.key.CRAWLDB_BACKEND, "solr"); // solr is default
+        return (String) this.get(dbToUse + ".uri");
+    }
+
 }
```
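A usage sketch of the new accessor, assuming `SparklerConfiguration` is Map-backed (as the `this.getOrDefault(...)` call above suggests); the no-arg constructor and `put(...)` calls are assumptions for illustration, since real configs are loaded from sparkler-default.yaml:

```java
import edu.usc.irds.sparkler.Constants;
import edu.usc.irds.sparkler.SparklerConfiguration;

public class DatabaseUriDemo {
    public static void main(String[] args) {
        // Assumed construction, for illustration only
        SparklerConfiguration conf = new SparklerConfiguration();
        conf.put(Constants.key.CRAWLDB_BACKEND, "elasticsearch");
        conf.put("elasticsearch.uri", "http://localhost:9200");

        // Resolves "<backend>.uri"; falls back to solr when crawldb.backend is absent
        System.out.println(conf.getDatabaseURI()); // -> http://localhost:9200
    }
}
```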

sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/solr/schema/FieldMapper.java renamed to sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/storage/solr/schema/FieldMapper.java

Lines changed: 1 addition & 1 deletion
```diff
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package edu.usc.irds.sparkler.solr.schema;
+package edu.usc.irds.sparkler.storage.solr.schema;
 
 import org.json.simple.parser.ParseException;
 import org.slf4j.Logger;
```

sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/solr/schema/StringEvaluator.java renamed to sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/storage/solr/schema/StringEvaluator.java

Lines changed: 1 addition & 1 deletion
```diff
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package edu.usc.irds.sparkler.solr.schema;
+package edu.usc.irds.sparkler.storage.solr.schema;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
```

sparkler-core/sparkler-api/src/test/resources/sparkler-default.yaml

Lines changed: 13 additions & 3 deletions
```diff
@@ -17,10 +17,20 @@
 
 ##################### General Properties ################################
 
-# Crawl Database URL. Stores crawl metadata and status updates.
-# Type: String. Default: http://localhost:8983/solr/crawldb
-crawldb.uri: file://conf/solr/crawldb
+# uri - Crawl Database URL. Stores crawl metadata and status updates.
+
+crawldb.backend: solr # "solr" is default until "elasticsearch" becomes usable.
 
+# Type: String. Default: http://localhost:8983/solr/crawldb
+# for standalone server
+# For quick test crawls using embedded solr
+# solr.uri: file://conf/solr/crawldb
+# For cloudmode with zookeepers; Format = collectionName::zkhost1:port1,zkhost2:port2,zkhost3:port3
+# solr.uri: crawldb::localhost:9983
+solr.uri: http://localhost:8983/solr/crawldb
+
+# elasticsearch settings
+elasticsearch.uri: http://localhost:9200
 
 
 ##################### Apache Spark Properties ###########################
```

sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/CrawlDbRDD.scala

Lines changed: 10 additions & 10 deletions
```diff
@@ -19,7 +19,7 @@ package edu.usc.irds.sparkler
 
 import edu.usc.irds.sparkler.base.Loggable
 import edu.usc.irds.sparkler.model.{Resource, ResourceStatus, SparklerJob}
-import edu.usc.irds.sparkler.solr.SolrGroupPartition
+import edu.usc.irds.sparkler.storage.solr.SolrGroupPartition
 import edu.usc.irds.sparkler.util.SolrResultIterator
 import org.apache.solr.client.solrj.SolrQuery
 import org.apache.solr.client.solrj.util.ClientUtils
@@ -50,27 +50,27 @@ class CrawlDbRDD(sc: SparkContext,
     val partition: SolrGroupPartition = split.asInstanceOf[SolrGroupPartition]
     val batchSize = 100
     val query = new SolrQuery(generateQry)
-    query.addFilterQuery(s"""${Constants.solr.GROUP}:"${escapeQueryChars(partition.group)}"""")
-    query.addFilterQuery(s"${Constants.solr.CRAWL_ID}:${job.id}")
+    query.addFilterQuery(s"""${Constants.storage.GROUP}:"${escapeQueryChars(partition.group)}"""")
+    query.addFilterQuery(s"${Constants.storage.CRAWL_ID}:${job.id}")
     query.set("sort", sortBy)
     query.setRows(batchSize)
 
-    new SolrResultIterator[Resource](job.newCrawlDbSolrClient().crawlDb, query,
+    new SolrResultIterator[Resource](job.newStorageProxy().getClient(), query,
       batchSize, classOf[Resource], closeClient = true, limit = topN)
   }
 
   override protected def getPartitions: Array[Partition] = {
     val qry = new SolrQuery(generateQry)
-    qry.addFilterQuery(s"${Constants.solr.CRAWL_ID}:${job.id}")
+    qry.addFilterQuery(s"${Constants.storage.CRAWL_ID}:${job.id}")
     qry.set("sort", sortBy)
     qry.set("group", true)
     qry.set("group.ngroups", true)
     qry.set("group.field", groupBy)
     qry.set("group.limit", 0)
     qry.setRows(maxGroups)
-    val proxy = job.newCrawlDbSolrClient()
-    val solr = proxy.crawlDb
-    val groupRes = solr.query(qry).getGroupResponse.getValues.get(0)
+    val proxy = job.newStorageProxy()
+    val client = proxy.getClient()
+    val groupRes = client.query(qry).getGroupResponse.getValues.get(0)
     val grps = groupRes.getValues
     CrawlDbRDD.LOG.info(s"selecting ${grps.size()} out of ${groupRes.getNGroups}")
     val res = new Array[Partition](grps.size())
@@ -86,8 +86,8 @@ class CrawlDbRDD(sc: SparkContext,
 
 object CrawlDbRDD extends Loggable {
 
-  val DEFAULT_ORDER = Constants.solr.DISCOVER_DEPTH + " asc," + Constants.solr.SCORE + " asc"
-  val DEFAULT_FILTER_QRY = Constants.solr.STATUS + ":" + ResourceStatus.UNFETCHED
+  val DEFAULT_ORDER = Constants.storage.DISCOVER_DEPTH + " asc," + Constants.storage.SCORE + " asc"
+  val DEFAULT_FILTER_QRY = Constants.storage.STATUS + ":" + ResourceStatus.UNFETCHED
   val DEFAULT_GROUPS = 1000
   val DEFAULT_TOPN = 1000
   val DEFAULT_GROUPBY = "group"
```
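The mechanical change in this file (and in the two Memex RDDs below) is `job.newCrawlDbSolrClient().crawlDb` becoming `job.newStorageProxy().getClient()`. A hedged sketch of the proxy shape those call sites imply; only `newStorageProxy()` and `getClient()` appear in this diff, so the interface name and the `close()` method are assumptions:

```java
import org.apache.solr.client.solrj.SolrClient;

// Assumed shape of the storage abstraction behind job.newStorageProxy().
// Today getClient() must behave like a SolrClient, since the RDDs still
// issue SolrQuery group queries through it; an elasticsearch-backed proxy
// is the stated follow-up in the config comments above.
interface StorageProxy {
    SolrClient getClient();
    void close(); // assumption: the proxy owns the client lifecycle
}
```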

sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/MemexCrawlDbRDD.scala

Lines changed: 11 additions & 11 deletions
```diff
@@ -2,7 +2,7 @@ package edu.usc.irds.sparkler
 
 import edu.usc.irds.sparkler.base.Loggable
 import edu.usc.irds.sparkler.model.{Resource, ResourceStatus, SparklerJob}
-import edu.usc.irds.sparkler.solr.SolrGroupPartition
+import edu.usc.irds.sparkler.storage.solr.SolrGroupPartition
 import edu.usc.irds.sparkler.util.SolrResultIterator
 import org.apache.solr.client.solrj.SolrQuery
 import org.apache.solr.client.solrj.util.ClientUtils.escapeQueryChars
@@ -29,27 +29,27 @@ class MemexCrawlDbRDD(sc: SparkContext,
     val partition: SolrGroupPartition = split.asInstanceOf[SolrGroupPartition]
     val batchSize = 100
     val query = new SolrQuery(generateQry)
-    query.addFilterQuery(s"""${Constants.solr.PARENT}:"${escapeQueryChars(partition.group)}"""")
-    query.addFilterQuery(s"${Constants.solr.CRAWL_ID}:${job.id}")
+    query.addFilterQuery(s"""${Constants.storage.PARENT}:"${escapeQueryChars(partition.group)}"""")
+    query.addFilterQuery(s"${Constants.storage.CRAWL_ID}:${job.id}")
     query.set("sort", sortBy)
     query.setRows(batchSize)
 
-    new SolrResultIterator[Resource](job.newCrawlDbSolrClient().crawlDb, query,
+    new SolrResultIterator[Resource](job.newStorageProxy().getClient(), query,
       batchSize, classOf[Resource], closeClient = true, limit = topN)
   }
 
   override protected def getPartitions: Array[Partition] = {
     val qry = new SolrQuery(generateQry)
-    qry.addFilterQuery(s"${Constants.solr.CRAWL_ID}:${job.id}")
+    qry.addFilterQuery(s"${Constants.storage.CRAWL_ID}:${job.id}")
     qry.set("sort", sortBy)
     qry.set("group", true)
     qry.set("group.ngroups", true)
-    qry.set("group.field", Constants.solr.PARENT)
+    qry.set("group.field", Constants.storage.PARENT)
     qry.set("group.limit", 0)
     qry.setRows(maxGroups)
-    val proxy = job.newCrawlDbSolrClient()
-    val solr = proxy.crawlDb
-    val groupRes = solr.query(qry).getGroupResponse.getValues.get(0)
+    val proxy = job.newStorageProxy()
+    val client = proxy.getClient()
+    val groupRes = client.query(qry).getGroupResponse.getValues.get(0)
     val grps = groupRes.getValues
     MemexCrawlDbRDD.LOG.info(s"selecting ${grps.size()} out of ${groupRes.getNGroups}")
     val res = new Array[Partition](grps.size())
@@ -65,8 +65,8 @@ class MemexCrawlDbRDD(sc: SparkContext,
 
 object MemexCrawlDbRDD extends Loggable {
 
-  val DEFAULT_ORDER = Constants.solr.DISCOVER_DEPTH + " asc," + Constants.solr.SCORE + " desc"
-  val DEFAULT_FILTER_QRY = Constants.solr.STATUS + ":" + ResourceStatus.UNFETCHED
+  val DEFAULT_ORDER = Constants.storage.DISCOVER_DEPTH + " asc," + Constants.storage.SCORE + " desc"
+  val DEFAULT_FILTER_QRY = Constants.storage.STATUS + ":" + ResourceStatus.UNFETCHED
   val DEFAULT_GROUPS = 1000
   val DEFAULT_TOPN = 1000
 }
```

sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/MemexDeepCrawlRDD.scala

Lines changed: 11 additions & 11 deletions
```diff
@@ -4,7 +4,7 @@ import java.net.URL
 
 import edu.usc.irds.sparkler.base.Loggable
 import edu.usc.irds.sparkler.model.{Resource, ResourceStatus, SparklerJob}
-import edu.usc.irds.sparkler.solr.SolrGroupPartition
+import edu.usc.irds.sparkler.storage.solr.SolrGroupPartition
 import edu.usc.irds.sparkler.util.SolrResultIterator
 import org.apache.solr.client.solrj.SolrQuery
 import org.apache.solr.client.solrj.util.ClientUtils.escapeQueryChars
@@ -38,28 +38,28 @@ class MemexDeepCrawlDbRDD(sc: SparkContext,
       }
     }
     query.addFilterQuery(hostnameFilter)
-    query.addFilterQuery(s"""${Constants.solr.PARENT}:"${escapeQueryChars(partition.group)}"""")
-    query.addFilterQuery(s"${Constants.solr.CRAWL_ID}:${job.id}")
+    query.addFilterQuery(s"""${Constants.storage.PARENT}:"${escapeQueryChars(partition.group)}"""")
+    query.addFilterQuery(s"${Constants.storage.CRAWL_ID}:${job.id}")
     query.set("sort", sortBy)
     query.setRows(batchSize)
 
-    new SolrResultIterator[Resource](job.newCrawlDbSolrClient().crawlDb, query,
+    new SolrResultIterator[Resource](job.newStorageProxy().getClient(), query,
      batchSize, classOf[Resource], closeClient = true, limit = topN)
   }
 
   override protected def getPartitions: Array[Partition] = {
     val qry = new SolrQuery(generateQry)
 
-    qry.addFilterQuery(s"${Constants.solr.CRAWL_ID}:${job.id}")
+    qry.addFilterQuery(s"${Constants.storage.CRAWL_ID}:${job.id}")
     qry.set("sort", sortBy)
     qry.set("group", true)
     qry.set("group.ngroups", true)
-    qry.set("group.field", Constants.solr.PARENT)
+    qry.set("group.field", Constants.storage.PARENT)
     qry.set("group.limit", 0)
     qry.setRows(maxGroups)
-    val proxy = job.newCrawlDbSolrClient()
-    val solr = proxy.crawlDb
-    val groupRes = solr.query(qry).getGroupResponse.getValues.get(0)
+    val proxy = job.newStorageProxy()
+    val client = proxy.getClient()
+    val groupRes = client.query(qry).getGroupResponse.getValues.get(0)
     val grps = groupRes.getValues
     MemexDeepCrawlDbRDD.LOG.info(s"selecting ${grps.size()} out of ${groupRes.getNGroups}")
     val res = new Array[Partition](grps.size())
@@ -75,8 +75,8 @@ class MemexDeepCrawlDbRDD(sc: SparkContext,
 
 object MemexDeepCrawlDbRDD extends Loggable {
 
-  val DEFAULT_ORDER = Constants.solr.DISCOVER_DEPTH + " asc," + Constants.solr.SCORE + " desc"
-  val DEFAULT_FILTER_QRY = Constants.solr.STATUS + ":" + ResourceStatus.UNFETCHED
+  val DEFAULT_ORDER = Constants.storage.DISCOVER_DEPTH + " asc," + Constants.storage.SCORE + " desc"
+  val DEFAULT_FILTER_QRY = Constants.storage.STATUS + ":" + ResourceStatus.UNFETCHED
   val DEFAULT_GROUPS = 10
   val DEFAULT_TOPN = 1000
 }
```
