Skip to content

Commit 4590b23

Browse files
Merge pull request #138 from USCDataScience/docs
Added documentation to methods
2 parents b5bd593 + 716a223 commit 4590b23

File tree

6 files changed

+313
-26
lines changed

6 files changed

+313
-26
lines changed

tika/detector.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,20 @@
1919
from .tika import detectType1, callServer, ServerEndpoint
2020

2121
def from_file(filename):
22+
'''
23+
Detects MIME type of specified file
24+
:param filename: file whose type needs to be detected
25+
:return: MIME type
26+
'''
2227
jsonOutput = detectType1('type', filename)
2328
return jsonOutput[1]
2429

2530
def from_buffer(string):
31+
'''
32+
Detects MIME type of the buffered content
33+
:param string: buffered content whose type needs to be detected
34+
:return: MIME type of the buffered content
35+
'''
2636
status, response = callServer('put', ServerEndpoint, '/detect/stream', string,
2737
{'Accept': 'text/plain'}, False)
2838
return response

tika/language.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,20 @@
1919
from .tika import detectLang1, callServer, ServerEndpoint
2020

2121
def from_file(filename):
22+
'''
23+
Detects language of the file
24+
:param filename: path to file whose language needs to be detected
25+
:return: detected language of the file
26+
'''
2227
jsonOutput = detectLang1('file', filename)
2328
return jsonOutput[1]
2429

2530
def from_buffer(string):
31+
'''
32+
Detects language of content in the buffer
33+
:param string: buffered data
34+
:return: detected language of the buffered content
35+
'''
2636
status, response = callServer('put', ServerEndpoint, '/language/string', string,
2737
{'Accept': 'text/plain'}, False)
2838
return response

tika/parser.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,15 @@
2121
import json
2222

2323
def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False):
24+
'''
25+
Parses a file for metadata and content
26+
:param filename: path to file which needs to be parsed
27+
:param serverEndpoint: Server endpoint url
28+
:param xmlContent: Whether or not XML content should be requested.
29+
Default is 'False', which results in text content.
30+
:return: dictionary having 'metadata' and 'content' keys.
31+
'content' has a str value and metadata has a dict type value.
32+
'''
2433
if not xmlContent:
2534
jsonOutput = parse1('all', filename, serverEndpoint)
2635
else:
@@ -29,16 +38,29 @@ def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False):
2938

3039

3140
def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False):
41+
'''
42+
Parses the content from buffer
43+
:param string: Buffer value
44+
:param serverEndpoint: Server endpoint. This is optional
45+
:param xmlContent: Whether or not XML content should be requested.
46+
Default is 'False', which results in text content.
47+
:return: dictionary having 'metadata' and 'content' keys.
48+
'''
3249
if not xmlContent:
3350
status, response = callServer('put', serverEndpoint, '/rmeta/text', string,
3451
{'Accept': 'application/json'}, False)
3552
else:
3653
status, response = callServer('put', serverEndpoint, '/rmeta/xml', string,
37-
{'Accept': 'application/json'}, False)
38-
54+
{'Accept': 'application/json'}, False)
55+
3956
return _parse((status,response))
4057

4158
def _parse(jsonOutput):
59+
'''
60+
Parses JSON response from Tika REST API server
61+
:param jsonOutput: JSON output from Tika Server
62+
:return: a dictionary having 'metadata' and 'content' values
63+
'''
4264
parsed={}
4365
if not jsonOutput:
4466
return parsed
@@ -62,7 +84,7 @@ def _parse(jsonOutput):
6284
if n != "X-TIKA:content":
6385
if n in parsed["metadata"]:
6486
if not isinstance(parsed["metadata"][n], list):
65-
parsed["metadata"][n] = [parsed["metadata"][n]]
87+
parsed["metadata"][n] = [parsed["metadata"][n]]
6688
parsed["metadata"][n].append(js[n])
6789
else:
6890
parsed["metadata"][n] = js[n]

0 commit comments

Comments
 (0)