-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsettings.py
More file actions
45 lines (37 loc) · 1.5 KB
/
settings.py
File metadata and controls
45 lines (37 loc) · 1.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# -*- coding: utf-8 -*-
# Database connection settings.
#
# Each value may be overridden through an environment variable so that real
# credentials do not have to be committed to the repository. When a variable
# is not set, the original development value is used, so existing callers see
# exactly the same constants as before.
from os import environ as _environ  # private alias: not re-exported by "import *"

DB_NAME = _environ.get("VNLAW_DB_NAME", "VNLaw_development")
USER_NAME = _environ.get("VNLAW_DB_USER", "anhtt")
HOST_NAME = _environ.get("VNLAW_DB_HOST", "localhost")
# NOTE(review): despite the name, this is presumably the password used
# together with USER_NAME -- confirm against the code that opens the
# connection. The fallback is a development-only value.
HOST_PASS = _environ.get("VNLAW_DB_PASS", "12345678")
# File locations, all relative to the current working directory.
_RAW_INPUT_DIR = "./input"
_TOPMINE_DIR = "./topmine"

# Article data: the source TSV and the processed text file.
INPUT_DATA_FILE_NAME = _RAW_INPUT_DIR + "/articles.tsv"
PROCESSED_INPUT_DATA_FILE_NAME = _TOPMINE_DIR + "/input/data.txt"

# Stop-word list: the source file and the processed copy.
INPUT_STOPWORDS_FILE_NAME = _RAW_INPUT_DIR + "/vietnamese-stopwords.txt"
PROCESSED_INPUT_STOPWORDS_FILE_NAME = (
    _TOPMINE_DIR + "/topmine_src/stopwords.txt"
)

# Mode selectors.
# NOTE(review): presumably matched against a command-line argument -- confirm
# in the calling script.
PARAM_BUILD_DATA, PARAM_BUILD_SW = "build_data", "build_stopwords"
# Literal substrings regarded as noise in the input text.
# NOTE(review): how the list is consumed is not visible in this file. If the
# entries are applied one after another, their order is significant, so it is
# kept exactly as it was.
REDUNDANT_SYMBOL = [
    # Escape sequences that appear spelled out as text (backslash + letter).
    "\\t",
    "\\r",
    "\\n",
    "##",
    "\\",  # a single backslash character
    "”",
    "**___________________**",
    "**",
    "---",
    "_",
    "|",
    "…",
    "---|---|---|---|---",
]
# Regular-expression fragments regarded as noise in the input text
# (presumably handed to the `re` module by the caller -- confirm).
#
# The patterns are raw strings so that regex escapes such as \d, \) and \;
# reach the regex engine untouched instead of being parsed as invalid string
# escapes, which newer Python versions warn about. Patterns that contain a
# double quote use single-quoted raw literals, because a raw string cannot
# escape its own delimiter without keeping the backslash.
# The resulting string values are identical to the previous ones.
REDUNDANT_PATTERN = [
    r"[a-z]\)",
    r"đ\)",
    "&",
    r'(\d+\/)*([A-Z\d\%\-a-z]+)+(\&[a-z]+\;([a-z]+\=[A-Za-z\d]+)+)+"',
    r'([a-z]+\="[a-z_]+"\&[a-z;]+)+',
    r'((\)\;)+[a-z\-\:\;"]+)+',
    r'([a-z]+\=".*\-)+',
    r"(right:[a-z\d]+\;)(color\:[a-z\(\d\,\s]+)",
]
# Regular-expression fragments for boilerplate phrases in the input text
# (structural headings followed by a number or roman numeral, fixed header
# lines, and list markers).
#
# Written as raw strings so that \d, \w and \) are regex escapes rather than
# invalid string escapes. The string values are identical to the previous
# ones.
REDUNDANT_STRING_PATTERN = [
    r"điều \d{1,2}",
    r"khoản \d{1,2}",
    "chương [ivxlcdm]+",
    r"điểm \d{1,2}",
    "phần thứ [a-zâấưăáảáíờộơ ]+",
    r"mục \d{1,2}",
    "cộng hoà xã hội chủ nghĩa việt nam",
    "độc lập - tự do - hạnh phúc",
    # NOTE(review): as a regex this is one or two digits, a literal backslash,
    # then ANY character (the final dot is not escaped). If a literal dot was
    # intended, the pattern needs one more backslash -- confirm before
    # changing, since that would alter what is matched.
    r"\d{1,2}\\.",
    r"[\w]\)",
    r"đ\)",
]
# Characters that are not allowed in topic strings.
# NOTE(review): "FORBIDEN" is a misspelling of "FORBIDDEN"; the name is kept
# because other modules may import it under this spelling.
FORBIDEN_SYMBOL_TOPICS = [
    "_",
    "%",
    "|",
    "\\",  # a single backslash character
    ":",
]
# SQL statements. The %s markers are placeholders meant to be filled in by
# the database driver when the statement is executed, not by Python string
# formatting. Each statement keeps its leading and trailing newline.

# Insert one row into `articles`, supplying only article_id.
QUERY_INSERT_AI_TO_ARTICLES = (
    "\n"
    "INSERT INTO articles(article_id) VALUES(%s);\n"
)

# Link a topic to an article; parameters are (topic_id, article_id).
QUERY_INSERT_INTO_ARTICLE_TOPICS = (
    "\n"
    "INSERT INTO article_topics(topic_id, article_id) VALUES(%s, %s);\n"
)

# Insert one topic; parameters are (id, value).
QUERY_INSERT_TO_TOPICS = (
    "\n"
    "INSERT INTO topics(id, value) VALUES(%s, %s);\n"
)

# Fetch every article_id stored in `articles`.
QUERY_SELECT_ARTICLE_ID = (
    "\n"
    "SELECT article_id FROM articles;\n"
)