-
Notifications
You must be signed in to change notification settings - Fork 261
hbase scalding Store based on maple/storehaus #404
base: develop
Are you sure you want to change the base?
Changes from 8 commits
1db3921
13e7a6f
0555010
e488903
849b748
f1b3998
205be31
6fbe3cd
c56b2a4
ff90ee7
cfb97d2
ff0f97d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,103 @@ | ||
| package com.twitter.summingbird.example | ||
|
|
||
| import com.twitter.scalding.{ Hdfs, TextLine } | ||
| import com.twitter.summingbird.batch.{ Batcher, Timestamp, BatchID } | ||
| import com.twitter.summingbird.scalding.{ InitialBatchedStore, Scalding, HBaseVersionedStore } | ||
| import com.twitter.summingbird.scalding.state.HDFSState | ||
| import com.twitter.summingbird.scalding.store.VersionedStore | ||
| import com.twitter.summingbird.{ Platform, Producer, TimeExtractor } | ||
| import com.twitter.storehaus.ReadableStore | ||
| import com.twitter.util.Await | ||
| import java.util.Date | ||
| import org.apache.hadoop.conf.Configuration | ||
|
|
||
| /** | ||
| * The following object contains code to execute a similar Scalding | ||
| * job to the WordCount job defined in ExampleJob.scala. This job works | ||
| * on plain text files, as opposed to tweet Status objects. | ||
| * The example job uses a Store on top of HBase. This does require you to | ||
| * set up a local running hbase with zookeeper. | ||
| * | ||
| * @author Josh Buffum | ||
| * @author Riju Kallivalappil | ||
| */ | ||
|
|
||
| object ScaldingRunner { | ||
| // One batch = one hour (see Batcher.ofHours(1) below); used for time arithmetic. | ||
| final val MillisInHour = 60 * 60 * 1000 | ||
|
|
||
| /** | ||
| * Directory location to store state and read input file. | ||
| */ | ||
| final val JobDir = "/user/mydir/wordcount" | ||
|
|
||
| /** | ||
| * pull in the serialization injections and WordCount job | ||
| */ | ||
| import Serialization._ | ||
|
|
||
| implicit val batcher = Batcher.ofHours(1) | ||
|
|
||
| // taken from ExampleJob | ||
| // Lower-cases, strips non-alphanumerics, splits on whitespace. | ||
| def tokenize(text: String) : TraversableOnce[String] = | ||
| text.toLowerCase | ||
| .replaceAll("[^a-zA-Z0-9\\s]", "") | ||
| .split("\\s+") | ||
|
|
||
| /** | ||
| * The actual Summingbird job. Works against text instead of tweet Status | ||
| */ | ||
| def wordCount[P <: Platform[P]](source: Producer[P, String], store: P#Store[String, Long]) = { | ||
| source | ||
| .filter(_ != null) | ||
| .flatMap { text: String => tokenize(text).map(_ -> 1L) } | ||
| .sumByKey(store) | ||
| } | ||
|
|
||
| // Always use an hour before the current time as the batch id. | ||
| // The storm job uses the current hour. This way we can get the "merger" to work across 2 batches | ||
| implicit val timeOf: TimeExtractor[String] = TimeExtractor(_ => new Date().getTime - MillisInHour) | ||
|
|
||
| // 'now' is captured once at object initialization; the waiting state starts | ||
| // two batch-widths in the past and spans 3 batches. | ||
| val now = System.currentTimeMillis | ||
| val waitingState = HDFSState(JobDir + "/waitstate", startTime = Some(Timestamp(now - 2 * MillisInHour)), | ||
| numBatches = 3) | ||
|
|
||
| // read text lines in input.txt as job input | ||
| val src = Producer.source[Scalding, String](Scalding.pipeFactoryExact(_ => TextLine(JobDir + "/input.txt"))) | ||
|
|
||
| /** | ||
| * Create the HBaseVersionedStore. Results from the Scalding job will be written | ||
| * as String => (BatchID, Long) pairs into a HBase cluster defined in a Zookeeper | ||
| * quorum at "localhost" in a table "wordcountJob" | ||
| */ | ||
| val versionedStore = HBaseVersionedStore[String, Long] ( | ||
| Seq("localhost"), | ||
| "wordcountJob" | ||
| ) | ||
|
|
||
| /** | ||
| * wrap the HBaseVersionedStore with an InitialBatchedStore to take care of the early batches | ||
| */ | ||
| val store = new InitialBatchedStore(batcher.currentBatch - 2L, versionedStore) | ||
| val mode = Hdfs(false, new Configuration()) | ||
|
|
||
| /** | ||
| * main | ||
| * Create the Scalding job and run it | ||
| */ | ||
| def runJob(args: Array[String]) { | ||
| // plan the word-count flow for the Scalding platform, then run it under waitingState | ||
| val job = Scalding("wordcountJob") | ||
| job.run(waitingState, mode, job.plan(wordCount[Scalding](src, store))) | ||
| } | ||
|
|
||
| /** | ||
| * lookup a Key value in the HBase store | ||
| */ | ||
| def lookup(key: String) : Option[(BatchID, Long)] = { | ||
| val reader = versionedStore.toReadableStore | ||
|
|
||
| // Await.result blocks the calling thread — acceptable for this client-side helper. | ||
| Await.result { | ||
| reader.get(key) | ||
| } | ||
| } | ||
|
|
||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,138 @@ | ||
| package com.twitter.summingbird.scalding | ||
|
|
||
| import cascading.flow.FlowDef | ||
| import cascading.tap.Tap | ||
| import cascading.tuple.Fields | ||
| import com.twitter.algebird.monad.Reader | ||
| import com.twitter.bijection.hbase.HBaseBijections.ImmutableBytesWritableBijection | ||
| import com.twitter.bijection.Injection | ||
| import com.twitter.bijection.Inversion.attempt | ||
| import com.twitter.maple.hbase.{HBaseScheme, HBaseTap} | ||
| import com.twitter.scalding.{AccessMode, Dsl, Mappable, Mode, Source, TupleConverter, TupleSetter, TypedPipe} | ||
| import com.twitter.scalding.typed.TypedSink | ||
| import com.twitter.storehaus.hbase.HBaseByteArrayStore | ||
| import com.twitter.storehaus.ReadableStore | ||
| import com.twitter.summingbird.batch.{Batcher, BatchID} | ||
| import com.twitter.util.{Await, Future} | ||
| import org.apache.hadoop.conf.Configuration | ||
| import org.apache.hadoop.hbase.io.ImmutableBytesWritable | ||
| import org.apache.hadoop.mapred.{ JobConf, OutputCollector, RecordReader } | ||
| import scala.util.{Failure, Success} | ||
|
|
||
| import Injection._ | ||
|
|
||
| /** | ||
| * Scalding implementation of the batch read and write components of a | ||
| * store backed by HBase (via maple's HBaseTap/HBaseScheme and storehaus-hbase). | ||
| * | ||
| * @author Josh Buffum | ||
| */ | ||
|
|
||
|
|
||
| object HBaseVersionedStore { | ||
|
|
||
| /**
| * Builds a store whose HBase rows are keyed by K and valued by (BatchID, V).
| */
| def apply[K, V](quorum: Seq[String], | ||
| table: String)( | ||
| implicit | ||
| batcher: Batcher, | ||
| injection: Injection[(K, (BatchID,V)), (Array[Byte], Array[Byte])], | ||
| keyInj: Injection[K, Array[Byte]], | ||
| valueInj: Injection[(BatchID,V), Array[Byte]], | ||
| ordering: Ordering[K]): HBaseVersionedStore[K, V, K, (BatchID,V)] = { | ||
| new HBaseVersionedStore[K, V, K, (BatchID,V)](quorum, table, batcher)( | ||
| // pack tags each value with batchID.next: the aggregate written for batch b is
| // the state "as of the start of batch b+1". unpack deliberately does NOT shift
| // back, so a pair read with batch id b+1 is the result of writing batch b.
| // NOTE(review): mirrors summingbird's VersionedStore packing — confirm against
| // that object's scaladoc.
| { case (batchID, (k, v)) => (k, (batchID.next, v)) })( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you comment the assymmetry on the .next? (I'm confused actually).
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry for the delay. I'll add the comment here. Just for reference, I borrowed the logic here from https://github.com/twitter/summingbird/blob/develop/summingbird-scalding/src/main/scala/com/twitter/summingbird/scalding/store/VersionedStore.scala. The comment I'll add will look very similar to the description of the VersionedStore object |
||
| { case (k, (batchID, v)) => (batchID, (k, v)) }) | ||
| } | ||
| } | ||
|
|
||
| class HBaseVersionedStore [K, V, K2, V2](quorum: Seq[String], | ||
| table: String, | ||
| override val batcher: Batcher) | ||
| (pack: (BatchID, (K, V)) => (K2, V2)) | ||
| (unpack: ((K2, V2)) => (BatchID, (K,V)))( | ||
| implicit | ||
| injection: Injection[(K2, V2), (Array[Byte], Array[Byte])], | ||
| keyInj: Injection[K, Array[Byte]], | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shouldn't this be K2? |
||
| valueInj: Injection[V2, Array[Byte]], override val ordering: Ordering[K]) extends BatchedScaldingStore[K, V] | ||
| { | ||
| // HBase layout: one column family holding a single key column and a single value column. | ||
| val KeyColumnName = "key" | ||
| val ValColumnName = "value" | ||
| val ColumnFamily = "versionedstore" | ||
|
|
||
| val scheme = new HBaseScheme(new Fields(KeyColumnName), ColumnFamily, new Fields(ValColumnName)) | ||
|
|
||
| implicit lazy val byteArray2BytesWritableInj : Injection[Array[Byte], ImmutableBytesWritable] = fromBijection[Array[Byte], ImmutableBytesWritable](ImmutableBytesWritableBijection[Array[Byte]]) | ||
|
|
||
| // Composes the (K2, V2) <-> byte-array injection with the byte-array <-> | ||
| // ImmutableBytesWritable bijection so rows can be handed to the HBase scheme. | ||
| implicit def kvpInjection: Injection[(K2, V2), (ImmutableBytesWritable,ImmutableBytesWritable)] = { | ||
| Injection.connect[(K2,V2), (Array[Byte],Array[Byte]), (ImmutableBytesWritable,ImmutableBytesWritable)] | ||
| } | ||
|
|
||
|
|
||
| // this is only used for client queries and does not need to be serialized out | ||
| // during the scalding job | ||
| @transient val hbaseStore = HBaseByteArrayStore (quorum, table, ColumnFamily, ValColumnName, true) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does it need to be a val? Can it be a def?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or perhaps a lazy val (and also transient)? |
||
| .convert[K,V2](keyInj)(valueInj) | ||
|
|
||
| /** | ||
| * Exposes a stream with the (K,V) pairs from the highest batchID less than | ||
| * the input "exclusiveUB" batchID. See readVersions() for the creation of this stream | ||
| * This method is called by BatchedScaldingStore.merge | ||
| */ | ||
| override def readLast(exclusiveUB: BatchID, mode: Mode): Try[(BatchID, FlowProducer[TypedPipe[(K, V)]])] = { | ||
| // NOTE(review): unconditionally Right, and reports exclusiveUB as the batch that | ||
| // was read; nothing persisted records the last batch actually written — confirm | ||
| // there is no data-loss risk when the table is empty or behind. | ||
| Right((exclusiveUB, readVersions(exclusiveUB))) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is not quite right, is it? It will always return a Right. What if there is no data on disk? It is not clear that this will not have data loss.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you need to store some state somewhere that records the last written batch. |
||
| } | ||
|
|
||
|
|
||
| // Scans every row of the table, unpacks to (BatchID, (K, V)), drops batches | ||
| // >= exclusiveUB, then keeps per key the value carried by the highest | ||
| // surviving batch. | ||
| def readVersions(exclusiveUB: BatchID): FlowProducer[TypedPipe[(K, V)]] = Reader { (flowMode: (FlowDef, Mode)) => | ||
| val mappable = new HBaseVersionedSource[K2, V2](table, scheme) | ||
|
|
||
| val filtered = TypedPipe.from(mappable)(flowMode._1, flowMode._2) | ||
| .map{x: (K2, V2) => unpack(x)} | ||
| .filter{ _._1 < exclusiveUB } // (BatchID, (K, V)) | ||
| .map{unpacked: (BatchID,(K,V)) => (unpacked._2._1,(unpacked._1,unpacked._2._2))} // (K, (BatchID, V)) | ||
|
|
||
| // order (BatchID, V) pairs by batch id only, so .max picks the newest batch | ||
| implicit def batchOrderer = Ordering.by[(BatchID,V),BatchID](_._1) | ||
|
|
||
| filtered | ||
| .group | ||
| .max | ||
| .map{x: (K, (BatchID,V)) => (x._1, x._2._2)} | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * write the (K, V) pairs aggregated up to batchID (inclusive) into the | ||
| * BatchedScaldingStore. In our case, this BatchedScaldingStore uses HBase | ||
| * as the mechanism to actually store data | ||
| * | ||
| * The data is written in serialized pairs of (K, (BatchID, V)) | ||
| */ | ||
| override def writeLast(batchID: BatchID, lastVals: TypedPipe[(K, V)])(implicit flowDef: FlowDef, mode: Mode): Unit = { | ||
| import Dsl._ | ||
|
|
||
| // pack each pair with the batch id, serialize to writables, and sink to HBase | ||
| lastVals.map{x: (K,V) => Injection[(K2,V2),(ImmutableBytesWritable,ImmutableBytesWritable)](pack(batchID, x))} | ||
| .toPipe(new Fields(KeyColumnName,ValColumnName)) | ||
| .write(new HBaseVersionedSource[K2, V2](table, scheme)) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We also need to write some state that notes what we have finished batchID. |
||
| } | ||
|
|
||
|
|
||
| // Client-side read view of the backing store; values come back still packed as V2. | ||
| def toReadableStore: ReadableStore[K,V2] = { | ||
| hbaseStore | ||
| } | ||
|
|
||
| } | ||
|
|
||
|
|
||
| /** | ||
| * Scalding Source/TypedSink over an HBase table, backed by maple's | ||
| * HBaseTap/HBaseScheme. Tuples flow through as (K, V) pairs. | ||
| * NOTE(review): the implicit injection parameter is not referenced in this | ||
| * visible body — confirm whether it is needed or comes via keyInj + valueInj. | ||
| */ | ||
| class HBaseVersionedSource[K, V](table: String, | ||
| scheme: HBaseScheme )( | ||
| implicit injection: Injection[(K, V), (Array[Byte], Array[Byte])]) | ||
| extends Source with Mappable[(K,V)] with TypedSink[(K,V)] | ||
| { | ||
| override def converter[U >: (K, V)] = TupleConverter.asSuperConverter[(K, V), U](TupleConverter.of[(K, V)]) | ||
|
|
||
| override def setter[U <: (K, V)] = TupleSetter.asSubSetter[(K, V), U](TupleSetter.of[(K,V)]) | ||
|
|
||
| // Unchecked cast: HBaseTap is a hadoop-mapred Tap at runtime. | ||
| override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_,_,_] = { | ||
| (new HBaseTap(table, scheme)).asInstanceOf[Tap[JobConf, RecordReader[_,_], OutputCollector[_,_]]] | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do you need this? Doesn't it come for free with the keyInj + valueInj?