[document-conversion] Adds index document API

samir-patel · samir-patel · commit e7801268be18 · 2016-07-05T11:39:01.000-04:00
This commit adds the index_document API for the Document
Conversion service to the Python SDK. It also includes
updated examples that show how to use the index_document API.
diff --git a/examples/document_conversion_v1.py b/examples/document_conversion_v1.py
@@ -13,9 +13,83 @@
 with open(join(dirname(__file__), '../resources/example.html'), 'r') as document:
     config = {'conversion_target': DocumentConversionV1.NORMALIZED_HTML}
     print(document_conversion.convert_document(document=document, config=config, media_type='text/html')
-          .content.decode('utf-8'))
+          .content)
 
 # Example with JSON
 with open(join(dirname(__file__), '../resources/example.html'), 'r') as document:
     config['conversion_target'] = DocumentConversionV1.ANSWER_UNITS
     print(json.dumps(document_conversion.convert_document(document=document, config=config), indent=2))
+
+# Examples of index_document API
+print("########## Example of a dry run of index_document with only a document ##########")
+with open(join(dirname(__file__), '../resources/example.html'), 'r') as document:
+    config = {
+        'retrieve_and_rank': {
+            'dry_run':'true'
+        }
+    }
+    print(json.dumps(document_conversion.index_document(config=config, document=document), indent=2))
+
+print("########## Example of a dry run of index_document with only metadata ##########")
+config = {
+    'retrieve_and_rank': {
+        'dry_run':'true'
+    }
+}
+metadata = {
+    'metadata': [
+        {'name':'id', 'value':'12345'}
+    ]
+}
+print(json.dumps(document_conversion.index_document(config=config, metadata=metadata), indent=2))
+
+print("########## Example of a dry run of index_document with document and metadata ##########")
+with open(join(dirname(__file__), '../resources/example.html'), 'r') as document:
+    config = {
+        'retrieve_and_rank': {
+            'dry_run':'true'
+        }
+    }
+    metadata = {
+        'metadata': [
+            {'name':'id', 'value':'12345'}
+        ]
+    }
+    print(json.dumps(document_conversion.index_document(config=config, document=document, metadata=metadata), indent=2))
+
+print("########## Example of a dry run of index_document with document, metadata, and additional config for conversion ##########")
+with open(join(dirname(__file__), '../resources/example.html'), 'r') as document:
+    config = {
+        'convert_document': {
+            'normalized_html': {
+                'exclude_content': {"xpaths":["//body/div"]}
+            }
+        },
+        'retrieve_and_rank': {
+            'dry_run':'true'
+        }
+    }
+    metadata = {
+        'metadata': [
+            {'name':'id', 'value':'12345'}
+        ]
+    }
+    print(json.dumps(document_conversion.index_document(config=config, document=document, metadata=metadata), indent=2))
+
+print("########## Example of index_document with document, metadata (A service instance id, SOLR cluster id, and "
+      "a SOLR collection name must be provided from the Retrieve and Rank service in order to index) ##########")
+with open(join(dirname(__file__), '../resources/example.html'), 'r') as document:
+    config = {
+        'retrieve_and_rank': {
+            'dry_run':'false',
+            'service_instance_id':'YOUR RETRIEVE AND RANK SERVICE INSTANCE ID',
+            'cluster_id':'YOUR RETRIEVE AND RANK SERVICE SOLR CLUSTER ID',
+            'search_collection':'YOUR RETRIEVE AND RANK SERVICE SOLR SEARCH COLLECTION NAME'
+        }
+    }
+    metadata = {
+        'metadata': [
+            {'name':'id', 'value':'12345'}
+        ]
+    }
+    print(json.dumps(document_conversion.index_document(config=config, document=document, metadata=metadata), indent=2))
diff --git a/watson_developer_cloud/document_conversion_v1.py b/watson_developer_cloud/document_conversion_v1.py
@@ -37,3 +37,27 @@ def convert_document(self, document, config, media_type=None):
         accept_json = config['conversion_target'] == DocumentConversionV1.ANSWER_UNITS
         return self.request(method='POST', url='/v1/convert_document', files=files, params=params,
                             accept_json=accept_json)
+
+    def index_document(self, config, document=None, metadata=None, media_type=None):
+        """
+        POSTs a document and/or metadata to the /v1/index_document endpoint.
+
+        :param config: object serialized with json.dumps and sent as the 'config' part (config.json)
+        :param document: optional file-like object with a 'name' attribute; sent as the 'file' part
+        :param metadata: optional object serialized with json.dumps and sent as the 'metadata' part
+        :param media_type: optional content type attached to the 'file' part when truthy
+        :return: whatever self.request returns for the call (made with accept_json=True)
+        :raises AssertionError: if both document and metadata are None
+        """
+        # Identity checks: '== None' / '!= None' would dispatch to any __eq__ the argument defines.
+        if document is None and metadata is None:
+            raise AssertionError('Missing required parameters: document or metadata. At least one of those is required.')
+        params = {'version': self.version}
+        files = [('config', ('config.json', json.dumps(config), 'application/json'))]
+        if document is not None:
+            filename = os.path.basename(document.name)
+            file_tuple = (filename, document, media_type) if media_type else (filename, document)
+            files.append(('file', file_tuple))
+        if metadata is not None:
+            files.append(('metadata', ('metadata.json', json.dumps(metadata), 'application/json')))
+        return self.request(method='POST', url='/v1/index_document', files=files, params=params, accept_json=True)