-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
89 lines (71 loc) · 2.52 KB
/
utils.py
File metadata and controls
89 lines (71 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import logging
import re
import string
import unicodedata
def manage_logger(logfile_name):
    """Return a logger that writes to both ``logfile_name`` and the console.

    The logger is looked up by ``logfile_name``, so every log file gets its
    own logger, and calling this function again with the same name returns
    the same logger without attaching duplicate handlers.

    Args:
        logfile_name: Path of the log file; also used as the logger's name.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger(logfile_name)
    # The logger itself filters at INFO, so DEBUG records are dropped here
    # before they reach the handlers, even though the handlers accept DEBUG.
    logger.setLevel(logging.INFO)
    # Keep records away from the root logger so they are not emitted twice.
    logger.propagate = False

    # Already configured by an earlier call: do not add the handlers again.
    if logger.handlers:
        return logger

    formatter = logging.Formatter(
        "%(asctime)s - %(levelname)s - %(message)s"
    )

    # Explicit UTF-8 so non-ASCII text (e.g. names containing Unicode dashes)
    # is written the same way regardless of the platform's locale encoding.
    file_handler = logging.FileHandler(logfile_name, encoding="utf-8")
    console_handler = logging.StreamHandler()

    # File handler first, console handler second; both accept DEBUG and up
    # and share the same record format.
    for handler in (file_handler, console_handler):
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger
def clean_value(formatted_name):
    """Normalize a name into a lowercase, punctuation-free string.

    Steps, in order: lowercase the text, turn hyphen/dash characters and
    periods into spaces, delete the remaining ASCII punctuation, then
    collapse runs of whitespace and trim the ends.

    Joined initials need no special handling: because dashes and periods
    both become spaces, "J.-L." and "J.L." already come out as "j l".

    Args:
        formatted_name: The name string to normalize.

    Returns:
        The normalized string, e.g. "J.-L. Picard" -> "j l picard".
    """
    cleaned = formatted_name.lower()
    # Dash-like characters between initials or name parts act as separators.
    cleaned = re.sub(r"[-‐‑‒–—―⁃﹘﹣-]", " ", cleaned)
    # Periods separate initials too ("j.l." -> "j l "), so they become
    # spaces rather than being deleted outright.
    cleaned = cleaned.replace(".", " ")
    # Any other ASCII punctuation is deleted without inserting a space,
    # so "o'brien" becomes "obrien".
    cleaned = cleaned.translate(str.maketrans("", "", string.punctuation))
    # Collapse the whitespace runs left behind by the replacements above.
    return re.sub(r"\s+", " ", cleaned).strip()
def normalize_title(title):
    """Convert ``title`` to title case, keeping minor words lowercase.

    The title is lowercased and split on whitespace. The first word is
    always capitalized; each later word is capitalized unless it is one of
    the listed articles, conjunctions, or prepositions. Words are re-joined
    with single spaces, so runs of whitespace collapse.

    Args:
        title: The title string to normalize.

    Returns:
        The normalized title, or an empty string when ``title`` contains no
        words (empty or whitespace-only input).
    """
    lowercase_words = frozenset(
        {
            "and",
            "or",
            "but",
            "a",
            "an",
            "the",
            "as",
            "at",
            "by",
            "for",
            "in",
            "of",
            "on",
            "to",
            "up",
            "with",
        }
    )
    words = title.lower().split()
    # Nothing to capitalize; indexing words[0] below would raise IndexError.
    if not words:
        return ""

    first, *rest = words
    normalized_title = [first.capitalize()]
    normalized_title.extend(
        word if word in lowercase_words else word.capitalize() for word in rest
    )
    return " ".join(normalized_title)