@@ -21,7 +21,7 @@ Document Conversion Service and upload it to the Retrieve and Rank Service to ma
2121*/
2222
2323var watson = require ( 'watson-developer-cloud' ) ;
24- var async = require ( 'async' ) ;
24+ var async = require ( 'async' ) ;
2525var fs = require ( 'fs' ) ;
2626
2727/*
@@ -57,17 +57,25 @@ var solrClient = retrieve.createSolrClient({
5757
5858async . waterfall ( [
5959
60- function convert ( done ) {
60+ function convert ( done ) {
6161 // convert a single document
6262 document_conversion . convert ( {
6363 // (JSON) ANSWER_UNITS, NORMALIZED_HTML, or NORMALIZED_TEXT
6464 file : fs . createReadStream ( __dirname + inputDocument ) ,
65- conversion_target : document_conversion . conversion_target . ANSWER_UNITS
66- } , function ( err , response ) {
65+ conversion_target : document_conversion . conversion_target . ANSWER_UNITS ,
66+ config : {
67+ html_to_html : {
68+ specify_content_to_extract : {
69+ enabled : true ,
70+ xpaths : [ '//h3' ]
71+ }
72+ }
73+ }
74+ } , function ( err , response ) {
6775 if ( err ) {
6876 console . error ( err ) ;
6977 } else {
70- done ( null , response ) ;
78+ done ( null , response ) ;
7179 }
7280 } ) ;
7381 } ,
@@ -76,13 +84,13 @@ async.waterfall([
7684 console . log ( 'Indexing a document...' ) ;
7785 var doc = mapAnswerUnits2SolrDocs ( response ) ;
7886 solrClient . add ( doc , function ( err ) {
79- if ( err ) {
87+ if ( err ) {
8088 console . log ( 'Error indexing document: ' + err ) ;
8189 done ( ) ;
8290 } else {
8391 console . log ( 'Indexed a document.' ) ;
8492 solrClient . commit ( function ( err ) {
85- if ( err ) {
93+ if ( err ) {
8694 console . log ( 'Error committing change: ' + err ) ;
8795 } else {
8896 console . log ( 'Successfully committed changes.' ) ;
@@ -99,10 +107,12 @@ async.waterfall([
99107 // This query searches for the term 'psychological' in the content_text field.
100108 // For a wildcard query use:
101109 // query.q({ '*' : '*' });
102- query . q ( { 'content_text' : 'psychological' } ) ;
110+ query . q ( {
111+ 'content_text' : 'psychological'
112+ } ) ;
103113
104114 solrClient . search ( query , function ( err , searchResponse ) {
105- if ( err ) {
115+ if ( err ) {
106116 console . log ( 'Error searching for documents: ' + err ) ;
107117 } else {
108118 console . log ( 'Found ' + searchResponse . response . numFound + ' document(s).' ) ;
@@ -116,7 +126,7 @@ async.waterfall([
116126function mapAnswerUnits2SolrDocs ( data ) {
117127 var answerUnits = data . answer_units ;
118128 var solrDocList = [ ] ;
119- answerUnits . forEach ( function ( value ) {
129+ answerUnits . forEach ( function ( value ) {
120130 var solrDoc = convertAnswerUnit2SolrDoc ( value ) ;
121131 solrDocList . push ( solrDoc ) ;
122132 } ) ;
@@ -126,9 +136,15 @@ function mapAnswerUnits2SolrDocs(data) {
126136function convertAnswerUnit2SolrDoc ( au ) {
127137 var solrDoc ;
128138 var auContents = au . content ;
129- auContents . forEach ( function ( auContent ) {
130- if ( auContent . media_type === 'text/plain' ) {
131- solrDoc = { id : au . id , title : au . title , type : au . type , media_type : auContent . media_type , content_text : auContent . text } ;
139+ auContents . forEach ( function ( auContent ) {
140+ if ( auContent . media_type === 'text/plain' ) {
141+ solrDoc = {
142+ id : au . id ,
143+ title : au . title ,
144+ type : au . type ,
145+ media_type : auContent . media_type ,
146+ content_text : auContent . text
147+ } ;
132148 }
133149 } ) ;
134150 return solrDoc ;
0 commit comments