-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmanindex.py
More file actions
103 lines (84 loc) · 3.99 KB
/
manindex.py
File metadata and controls
103 lines (84 loc) · 3.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# ====================================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
#
# Author: Erik Hatcher
#
# To index all man pages found on $MANPATH (or under /usr/share/man when
# MANPATH is not set), writing the index to the directory "pages":
#   python manindex.py pages
# ====================================================================
import os, re, sys, lucene
from subprocess import *
from java.io import File
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.document import Document, Field, StringField, TextField
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
def indexDirectory(dir):
    """Index every regular file located directly inside ``dir``.

    Entries that are not regular files (for example sub-directories) are
    ignored; there is no recursion.  Each regular file is passed to
    ``indexFile`` together with ``dir``.
    """
    for entry in os.listdir(dir):
        if not os.path.isfile(os.path.join(dir, entry)):
            continue
        indexFile(dir, entry)
def _splitManPageName(filename):
    """Split a man page file name into ``(command, section, compressed)``.

    ``compressed`` is True when the name ends in ``.gz``.  The section is
    whatever follows the last dot once a trailing ``.gz`` has been removed
    (``ls.1.gz`` -> ``('ls', '1', True)``).  A name with no dot left yields
    the whole name as the command and an empty section instead of failing.
    """
    compressed = filename.endswith('.gz')
    stem = filename[:-len('.gz')] if compressed else filename
    command, dot, section = stem.rpartition('.')
    if not dot:
        return stem, '', compressed
    return command, section, compressed


def _renderManPage(path, cwd, compressed):
    """Run ``[gunzip -c |] groff -t -e -E -mandoc -Tascii | col -bx``.

    The stages are spawned directly from argument lists (no shell), so
    ``path`` may contain spaces or shell metacharacters.  Every stage runs
    with ``cwd`` as its working directory.

    Returns a ``(output, exit_codes)`` tuple: the raw output of the last
    stage and the exit status of every stage, in pipeline order.
    """
    groff = ['groff', '-t', '-e', '-E', '-mandoc', '-Tascii']
    if compressed:
        # groff gets no file argument here and reads gunzip's output.
        stages = [['gunzip', '-c', path], groff, ['col', '-bx']]
    else:
        stages = [groff + [path], ['col', '-bx']]

    processes = []
    feed = None
    for argv in stages:
        process = Popen(argv, stdin=feed, stdout=PIPE, cwd=cwd)
        if feed is not None:
            # The child holds its own copy of the pipe; closing ours lets
            # the producer receive SIGPIPE if the consumer exits early.
            feed.close()
        feed = process.stdout
        processes.append(process)

    output = processes[-1].communicate()[0]
    return output, [process.wait() for process in processes]


def _extractSection(text, headings):
    """Return the body that follows one of ``headings`` in ``text``.

    A heading must stand alone on a line.  The body runs up to the next
    line that starts with a non-blank character, or to the end of the text
    when the section is the last one.  Returns '' if no heading is found.
    """
    pattern = r'^(?:%s)$(.*?)(?:^\S|\Z)' % '|'.join(headings)
    found = re.search(pattern, text, re.MULTILINE | re.DOTALL)
    return found.group(1) if found else ''


def indexFile(dir, filename):
    """Format one man page and add it to the index as a document.

    The page ``filename`` inside ``dir`` is rendered to plain text with
    groff (after gunzip when the name ends in ``.gz``) and its NAME,
    SYNOPSIS and DESCRIPTION/OVERVIEW sections are extracted.  The document
    gets the fields ``command``, ``section``, ``name``, ``synopsis`` and
    ``filename`` (all ``TYPE_STORED``) plus ``keywords``
    (``TextField.TYPE_NOT_STORED``), and is added through the module-level
    ``writer``.

    Raises:
        RuntimeError: if any stage of the formatting pipeline exits with a
            non-zero status.
        OSError: if one of the external programs cannot be started.
    """
    path = os.path.join(dir, filename)
    print(" File:  %s" % filename)

    command, section, compressed = _splitManPageName(filename)

    # The children run in the parent of ``dir`` (presumably so relative
    # ``.so`` includes such as ``man1/foo.1`` resolve -- TODO confirm), so
    # both that directory and the page path must be absolute to stay valid
    # after the change of working directory.
    man_root = os.path.dirname(os.path.abspath(dir))
    data, exit_codes = _renderManPage(os.path.abspath(path), man_root,
                                      compressed)

    failures = [code for code in exit_codes if code]
    if failures:
        raise RuntimeError('%s failed with exit code %d'
                           % (command, failures[0]))

    if not isinstance(data, str):
        # On Python 3 the pipe yields bytes but the patterns are text;
        # on Python 2 the data is already str and is left alone.
        data = data.decode('ascii', 'replace')

    name = _extractSection(data, ('NAME',))
    synopsis = _extractSection(data, ('SYNOPSIS', 'SYNOPSYS'))
    description = _extractSection(data, ('DESCRIPTION', 'OVERVIEW'))

    doc = Document()
    doc.add(Field("command", command, StringField.TYPE_STORED))
    doc.add(Field("section", section, StringField.TYPE_STORED))
    doc.add(Field("name", name.strip(), TextField.TYPE_STORED))
    doc.add(Field("synopsis", synopsis.strip(), TextField.TYPE_STORED))
    doc.add(Field("keywords", ' '.join((command, name, synopsis, description)),
                  TextField.TYPE_NOT_STORED))
    doc.add(Field("filename", os.path.abspath(path), StringField.TYPE_STORED))
    writer.addDocument(doc)
# Script entry point: build a Lucene index of the man pages found under
# every directory listed in $MANPATH (default /usr/share/man).  The single
# command-line argument is the directory the index is written to.
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: python manindex.py <index dir>")
    else:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        directory = SimpleFSDirectory(File(sys.argv[1]))
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        analyzer = LimitTokenCountAnalyzer(analyzer, 10000)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        # Must stay a module-level name: indexFile() adds documents via it.
        writer = IndexWriter(directory, config)

        # An empty MANPATH is treated like an unset one.
        manpath = (os.environ.get('MANPATH') or '/usr/share/man').split(os.pathsep)
        for root in manpath:
            if not os.path.isdir(root):
                # MANPATH may contain empty components (leading, trailing
                # or doubled separators) or directories that do not exist;
                # os.listdir() would raise OSError on either.
                if root:
                    print("Skipping %s (not a directory)" % root)
                continue
            print("Crawling %s" % root)
            for name in os.listdir(root):
                path = os.path.join(root, name)
                if os.path.isdir(path):
                    indexDirectory(path)

        writer.commit()
        writer.close()