Commit 91b9bac

fix #565 - add documentation to finish BulkLoadFromJdbcRaw
1 parent 48724f2 commit 91b9bac

File tree

1 file changed (+76, -8 lines)

src/main/java/com/marklogic/client/example/cookbook/datamovement/BulkLoadFromJdbcRaw.java

Lines changed: 76 additions & 8 deletions
@@ -38,9 +38,6 @@
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.SerializationFeature;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import java.io.IOException;
 import java.sql.ResultSet;
 import java.sql.SQLException;
@@ -49,15 +46,37 @@
 import java.util.UUID;
 import javax.sql.DataSource;
 
+/** BulkLoadFromJdbcRaw shows one way to load rows as-is from JDBC (the source
+ * system) into MarkLogic (the target system), then transform (or harmonize)
+ * after everything has been ingested. We pull each row from three tables:
+ * employees, salaries, and titles. We convert employee rows to the Employee
+ * POJO and the salaries and titles rows to Jackson ObjectNode (JSON). Then we
+ * write each row to MarkLogic Server (the target) as flat JSON. Then we run a
+ * transform on each employee record to pull in the salaries and titles for that
+ * employee. We call this "denormalization", and it is a common way to translate
+ * relational data into documents.
+ *
+ * This example assumes there are no employees, salaries, or titles records in
+ * the target. If this is run multiple times to capture changes, a step should
+ * be added to first delete all employees, salaries, and titles in the target
+ * system. Otherwise old data (data updated or deleted from the source system)
+ * might get mixed with new data.
+ */
 public class BulkLoadFromJdbcRaw {
-  private static Logger logger = LoggerFactory.getLogger(BulkLoadFromJdbcRaw.class);
-
+  // during testing we run with a small data set and few threads, but production
+  // systems would use many more threads and much larger batch sizes
   private static int threadCount = 3;
   private static int batchSize = 3;
 
+  // DataMovementManager helps orchestrate optimized writes across the
+  // MarkLogic cluster
   public static final DataMovementManager moveMgr =
     DatabaseClientSingleton.get().newDataMovementManager();
+
+  // ObjectMapper serializes objects to JSON for writing
   public static final ObjectMapper mapper = new ObjectMapper();
+
+  // the ISO 8601 format is expected for dates in MarkLogic Server
   public static final String ISO_8601_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSSXXX";
   public static final SimpleDateFormat dateFormat = new SimpleDateFormat(ISO_8601_FORMAT);
 
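To make the "denormalization" described in the class comment concrete, here is a standalone Jackson sketch (not part of the commit; the values and exact field layout are illustrative assumptions) of how a flat salary document relates to the employee document produced by the transform:

    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.node.ObjectNode;

    public class DenormalizationShapeSketch {
      public static void main(String[] args) {
        ObjectMapper mapper = new ObjectMapper();

        // before the transform: one flat document per salary row,
        // written to a uri like /salaries/<uuid>.json
        ObjectNode salary = mapper.createObjectNode();
        salary.put("employeeId", 10001);   // made-up values for illustration
        salary.put("salary", 60117);

        // after the transform: the salary (minus employeeId) is pushed into
        // a "salaries" array on /employees/10001.json
        ObjectNode employee = mapper.createObjectNode();
        employee.put("employeeId", 10001);
        salary.remove("employeeId");
        employee.putArray("salaries").add(salary);

        // prints {"employeeId":10001,"salaries":[{"salary":60117}]}
        System.out.println(employee);
      }
    }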
@@ -66,11 +85,15 @@ public static void main(String[] args) throws IOException, SQLException {
   }
 
   public void run() throws IOException, SQLException {
+    // first use a REST Admin user to install our transform
     installTransform(DatabaseClientSingleton.getRestAdmin());
+    // then load all the data from the source system to the target system
     load();
+    // then pull the salaries and titles into the employees
     transform();
   }
 
+  // take data from a JDBC ResultSet (row) and populate an Employee POJO object
   public void populateEmployee(ResultSet row, Employee e) throws SQLException {
     e.setEmployeeId(row.getInt("emp_no"));
     e.setBirthDate(Calendar.getInstance());
@@ -86,6 +109,7 @@ public void populateEmployee(ResultSet row, Employee e) throws SQLException {
     e.getHireDate().setTime(row.getDate("hire_date"));
   }
 
+  // take data from a JDBC ResultSet (row) and populate an ObjectNode (JSON) object
   public void populateSalary(ResultSet row, ObjectNode s) throws SQLException {
     s.put("employeeId", row.getInt("emp_no"));
     s.put("salary", row.getInt("salary"));
@@ -97,6 +121,7 @@ public void populateSalary(ResultSet row, ObjectNode s) throws SQLException {
     s.put("toDate", dateFormat.format(toDate.getTime()));
   }
 
+  // take data from a JDBC ResultSet (row) and populate an ObjectNode (JSON) object
   public void populateTitle(ResultSet row, ObjectNode t) throws SQLException {
     t.put("employeeId", row.getInt("emp_no"));
     t.put("title", row.getString("title"));
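The populate methods format JDBC dates with the shared dateFormat because, as noted above, MarkLogic Server expects ISO 8601 date strings. A small standalone sketch (not part of the commit) showing what that pattern produces for a fixed instant:

    import java.text.SimpleDateFormat;
    import java.util.Date;
    import java.util.TimeZone;

    public class Iso8601FormatSketch {
      public static void main(String[] args) {
        SimpleDateFormat dateFormat =
          new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSXXX");
        // pin the zone so the output is deterministic; the example itself
        // uses the JVM default time zone
        dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
        // prints 1986-06-26T04:00:00.000Z
        System.out.println(dateFormat.format(new Date(520142400000L)));
      }
    }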
@@ -109,36 +134,57 @@ public void populateTitle(ResultSet row, ObjectNode t) throws SQLException {
   }
 
   public void load() throws IOException, SQLException {
+    // the JdbcTemplate is an easy way to run JDBC queries
     JdbcTemplate jdbcTemplate = new JdbcTemplate(getDataSource());
+
+    // the WriteBatcher is used to queue writes, batch them, and distribute
+    // them to all appropriate nodes in the MarkLogic cluster
     WriteBatcher wb = moveMgr.newWriteBatcher()
       .withBatchSize(batchSize)
       .withThreadCount(threadCount)
       .onBatchSuccess(batch -> System.out.println("Written: " + batch.getJobWritesSoFar()))
       .onBatchFailure((batch,exception) -> exception.printStackTrace());
     JobTicket ticket = moveMgr.startJob(wb);
+
+    // run a JDBC query and, for each row returned, populate an Employee object,
+    // then add it to the WriteBatcher with a uri in the /employees/ directory
     jdbcTemplate.query("SELECT * FROM employees",
       (RowCallbackHandler) row -> {
         Employee employee = new Employee();
         populateEmployee(row, employee);
         wb.addAs("/employees/" + employee.getEmployeeId() + ".json", employee);
       }
     );
+
+    // run a JDBC query and, for each salary returned, populate an ObjectNode
+    // (JSON) object, then add it to the WriteBatcher with a uri in the
+    // /salaries/ directory
     jdbcTemplate.query("SELECT * FROM salaries",
      (RowCallbackHandler) row -> {
        ObjectNode salary = mapper.createObjectNode();
        populateSalary(row, salary);
        wb.addAs("/salaries/" + UUID.randomUUID().toString() + ".json", salary);
      }
    );
+
+    // run a JDBC query and, for each title returned, populate an ObjectNode
+    // (JSON) object, then add it to the WriteBatcher with a uri in the
+    // /titles/ directory
     jdbcTemplate.query("SELECT * FROM titles",
       (RowCallbackHandler) row -> {
         ObjectNode title = mapper.createObjectNode();
         populateTitle(row, title);
         wb.addAs("/titles/" + UUID.randomUUID().toString() + ".json", title);
       }
     );
+
+    // finish all writes before proceeding
     wb.flushAndWait();
+
+    // free any resources used by the WriteBatcher
     moveMgr.stopJob(wb);
+
+    // double-check that the WriteBatcher job had no failures
     JobReport report = moveMgr.getJobReport(ticket);
     if ( report.getFailureBatchesCount() > 0 ) {
       throw new IllegalStateException("Encountered " +
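load() hands JdbcTemplate a DataSource from getDataSource(), whose body is outside this diff. A minimal sketch of what such a DataSource could look like, assuming Spring's DriverManagerDataSource and the MySQL "employees" sample database (hypothetical connection settings; the cookbook example obtains its own settings rather than hard-coding them):

    import javax.sql.DataSource;
    import org.springframework.jdbc.datasource.DriverManagerDataSource;

    public class DataSourceSketch {
      static DataSource getDataSource() {
        // hypothetical driver, url, and credentials for illustration only
        DriverManagerDataSource ds = new DriverManagerDataSource();
        ds.setDriverClassName("com.mysql.cj.jdbc.Driver");
        ds.setUrl("jdbc:mysql://localhost:3306/employees");
        ds.setUsername("user");
        ds.setPassword("password");
        return ds;
      }
    }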
@@ -147,21 +193,32 @@ public void load() throws IOException, SQLException {
   }
 
   public void transform() throws IOException, SQLException {
+    // search for all records in the /employees/ directory
     StructuredQueryDefinition query = new StructuredQueryBuilder().directory(1, "/employees/");
+
+    // the QueryBatcher efficiently paginates through matching batches from all
+    // appropriate nodes in the cluster, then applies the transform on each batch
     QueryBatcher qb = moveMgr.newQueryBatcher(query)
       .withThreadCount(threadCount)
       .withBatchSize(batchSize)
       .onQueryFailure(throwable -> throwable.printStackTrace());
 
+    // the ApplyTransformListener performs the transform on each batch and
+    // overwrites the employee document with the results of the transform
     ApplyTransformListener transformListener = new ApplyTransformListener()
       .withTransform(new ServerTransform("BulkLoadFromJdbcRaw"))
       .withApplyResult(ApplyTransformListener.ApplyResult.REPLACE)
       .onBatchFailure((batch, throwable) -> throwable.printStackTrace());
+
+    // add the ApplyTransformListener to the QueryBatcher
     qb.onUrisReady(transformListener);
 
+    // start the job (across threadCount threads) and wait for it to finish
     JobTicket ticket = moveMgr.startJob(qb);
     qb.awaitCompletion();
     moveMgr.stopJob(ticket);
+
+    // double-check that the QueryBatcher job had no failures
     JobReport report = moveMgr.getJobReport(ticket);
     if ( report.getFailureBatchesCount() > 0 ) {
       throw new IllegalStateException("Encountered " +
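The class comment above notes that re-running the example should first delete previously loaded documents from the target. A minimal sketch of that extra step, reusing the QueryBatcher pattern from transform() with the Data Movement SDK's DeleteListener (an assumption for illustration, not code from this commit; it relies on the class's moveMgr, batchSize, and threadCount fields and an import of com.marklogic.client.datamovement.DeleteListener):

    // delete everything previously loaded into the three directories
    StructuredQueryDefinition allLoadedDocs = new StructuredQueryBuilder()
      .directory(1, "/employees/", "/salaries/", "/titles/");
    QueryBatcher deleteBatcher = moveMgr.newQueryBatcher(allLoadedDocs)
      .withBatchSize(batchSize)
      .withThreadCount(threadCount)
      .onUrisReady(new DeleteListener())
      .onQueryFailure(throwable -> throwable.printStackTrace());
    moveMgr.startJob(deleteBatcher);
    deleteBatcher.awaitCompletion();
    moveMgr.stopJob(deleteBatcher);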
@@ -175,6 +232,8 @@ private DataSource getDataSource() throws IOException {
   }
 
   public void installTransform(DatabaseClient client) {
+    // this transform seeks salaries and titles associated with a single
+    // employee record and injects them into the employee record
     String script =
       "function transform_function(context, params, content) { " +
       "  var uri = context.uri; " +
@@ -188,10 +247,19 @@ public void installTransform(DatabaseClient client) {
       "    for (let salary of salaries) { " +
       "      var employeeSalary = salary.toObject(); " +
       "      delete employeeSalary.employeeId; " +
-      "      employee.salaries = employeeSalary; " +
+      "      employee.salaries.push(employeeSalary); " +
       "    } " +
-      "    for ( var i=1; i <= fn.count(salaries); i++ ) { " +
-      "      xdmp.documentDelete(fn.baseUri(fn.subsequence(salaries, i, 1))) " +
+      "  } " +
+      "  var titles = cts.search(cts.andQuery([" +
+      "    cts.directoryQuery('/titles/'), " +
+      "    cts.jsonPropertyValueQuery('employeeId', employee.employeeId)" +
+      "  ])); " +
+      "  if ( fn.count(titles) > 0 ) { " +
+      "    employee.titles = new Array(); " +
+      "    for (let title of titles) { " +
+      "      var employeeTitle = title.toObject(); " +
+      "      delete employeeTitle.employeeId; " +
+      "      employee.titles.push(employeeTitle); " +
       "    } " +
       "  } " +
       "  return employee; " +

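For context, installing the server-side JavaScript above under the name "BulkLoadFromJdbcRaw" (so the ServerTransform used in transform() can find it) would look roughly like the following sketch. This assumes the Java Client API's TransformExtensionsManager and StringHandle; the actual installTransform body outside this hunk is not shown in the diff.

    // sketch: register the script String as a REST transform (assumption)
    TransformExtensionsManager transformMgr =
      client.newServerConfigManager().newTransformExtensionsManager();
    transformMgr.writeJavascriptTransform("BulkLoadFromJdbcRaw", new StringHandle(script));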