main2duc.py
# coding=utf-8
"""
Read the DUC2007 documents and preprocess them.
"""
import os
import codecs
import re

import nltk

from nlp_util import *
from conf import DUC_DIR
from JS import *

PATH = "./main"
PATH_NEW = "./DUC2007"

# Scratch code that mirrored the topic directories of PATH under PATH_NEW:
# walk = os.walk(PATH)
# for root, dirs, files in walk:
#     if dirs:
#         print(dirs)
#         for dir in dirs:
#             path = os.path.join(PATH_NEW, dir)
#             os.makedirs(path)
#     for name in files:
#         print(os.path.join(root, name))
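
# Note: nlp_util, conf, and JS are local project modules. Based on how they are
# used below, they are expected to provide compute_tf(), splitSentence(),
# js_divergence(), and the DUC_DIR constant.
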
def doc_label_filter(doc):
    """
    Strip SGML tags, entities, and other markup noise from a document.
    """
    doc = doc.replace('\n ', ' ').replace('\t', '').replace('.....', '')
    doc = doc.replace('\n', ' ').replace('-', '').replace('...', '')
    doc = re.sub(r'<.*?>', '', doc)    # SGML/HTML tags
    doc = re.sub(r'&.*?;', '', doc)    # character entities such as &amp;
    doc = re.sub(r'\(.*?\)', '', doc)  # parenthesised asides
    return doc
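
# Illustrative example (not in the original source): tags and parenthesised
# text are removed, so
#   doc_label_filter(u'<P>Stocks rose (Reuters) today.</P>')
# returns u'Stocks rose  today.'
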
def get_text(doc):
    """
    Extract the sentences inside the <TEXT> block of a DUC document.
    """
    doc = doc.replace('\n', ' ').replace('\t', '')
    m = re.findall(r'<TEXT>.*?</TEXT>', doc)
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw = m[0]
    raw = re.sub(r'<.*?>', '', raw)  # drop the remaining SGML tags
    sentences = tokenizer.tokenize(raw.strip())
    doc_sentences = []
    for sentence in sentences:
        # Drop datelines such as "WASHINGTON --" or "NEW YORK _".
        if len(sentence.split('--', 1)) > 1:
            sentence = sentence.split('--', 1)[1]
        if len(sentence.split('_', 1)) > 1:
            sentence = sentence.split('_', 1)[1]
        sentence = re.sub(r'\(.*?\)', '', sentence)  # parenthesised asides
        sentence = re.sub(r'&.*?;', '', sentence)    # character entities
        # Keep only reasonably long sentences (more than 13 tokens).
        if len(sentence.split()) > 13:
            doc_sentences.append(sentence)
    return doc_sentences
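
# Illustrative sketch of the expected input (hypothetical): a DUC article whose
# body contains
#   <TEXT><P>WASHINGTON -- The committee voted to ...</P></TEXT>
# yields the sentences inside <TEXT>, with the "WASHINGTON --" dateline stripped
# and any sentence of 13 or fewer tokens discarded.
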
def load_docset(docset_path):
    """
    Load every document in one DUC topic directory and return its sentences.
    """
    docset_id = os.path.split(docset_path)[1]
    docs = []
    walk = os.walk(docset_path)
    for root, dirs, files in walk:
        for name in files:
            # print(os.path.join(root, name))
            doc = codecs.open(os.path.join(root, name), 'r', 'utf-8', 'ignore').read()
            # Topic directory name, e.g. 'D0701A' for './main/D0701A/<file>'.
            topic_dir = os.path.join(root, name).split('/')[2]
            path_1 = os.path.join(PATH_NEW, topic_dir)
            if not os.path.exists(path_1):
                os.makedirs(path_1)
            doc_sentences = get_text(doc)
            # with open(os.path.join(path_1, name), 'a') as f:
            #     for sentence in doc_sentences:
            #         f.write(sentence + '\n')
            # print(splitSentence(doc))
            docs.append(doc_sentences)
    return docset_id, docs

def load_docsets(duc_dir):
    """
    Load every topic directory under duc_dir into a {docset_id: docs} dict.
    """
    docset_paths = [os.path.join(duc_dir, fname) for fname in os.listdir(duc_dir)]
    docset_paths = [path for path in docset_paths if os.path.isdir(path)]
    docsets = {}
    for docset_path in docset_paths:
        docset_id, docs = load_docset(docset_path)
        docsets[docset_id] = docs
    return docsets
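
# A minimal usage sketch (assuming PATH points at the DUC2007 topic directories,
# e.g. ./main/D0701A/...):
#   docsets = load_docsets(PATH)
#   # docsets maps a topic id such as 'D0701A' to a list of documents,
#   # each document being the list of sentences returned by get_text().
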
def js_duc_models():
    """
    Score each DUC2007 model summary against its topic's documents with JS divergence.
    """
    docsets = load_docsets(PATH)
    topics_id = docsets.keys()
    for topic_id in topics_id:
        docs = docsets[topic_id]
        sentences = []
        for doc in docs:
            sentences.extend(doc)
        doc_freq = compute_tf(sentences)
        for fname in os.listdir('./models'):
            # Match model files whose name before the first '.' equals the topic id
            # without its trailing letter, e.g. 'D0701' for topic 'D0701A'.
            if fname.split('.')[0] == topic_id[:-1]:
                print(fname)
                raw = codecs.open(os.path.join('./models', fname), 'r', 'utf-8').read()
                print(js_divergence(splitSentence(raw), doc_freq))


# load_docsets(PATH)
# print('./main/D0701A/aaa.txt'.split('/')[2])
if __name__ == '__main__':
    js_duc_models()