88
99
1010def validate_file_path (
11- file_path : str ,
12- preexisting_file : bool = False ,
13- writable : bool = False
11+ file_path : str , preexisting_file : bool = False , writable : bool = False
1412) -> bool :
1513 """
1614 Validate a file path.
17-
15+
1816 Args:
1917 file_path: Path to validate
2018 preexisting_file: If True, check that file exists
2119 writable: If True, check that directory is writable
22-
20+
2321 Returns:
2422 True if valid
25-
23+
2624 Raises:
2725 ValueError: If path is empty or invalid
2826 FileNotFoundError: If preexisting_file=True and file doesn't exist
@@ -50,10 +48,10 @@ def validate_file_path(
5048def is_supported_format (file_path : str ) -> bool :
5149 """
5250 Check if file format is supported for poster extraction.
53-
51+
5452 Args:
5553 file_path: Path to poster file
56-
54+
5755 Returns:
5856 True if PDF, JPG, JPEG, or PNG
5957 """
@@ -64,10 +62,10 @@ def is_supported_format(file_path: str) -> bool:
6462def get_poster_format (file_path : str ) -> Optional [str ]:
6563 """
6664 Get the format type of a poster file.
67-
65+
6866 Args:
6967 file_path: Path to poster file
70-
68+
7169 Returns:
7270 "pdf", "image", or None if unsupported
7371 """
@@ -82,16 +80,16 @@ def get_poster_format(file_path: str) -> Optional[str]:
8280def normalize_text (text : str ) -> str :
8381 """
8482 Normalize text for comparison.
85-
83+
8684 Handles:
8785 - Unicode normalization (NFKD)
8886 - Whitespace consolidation
8987 - Quote unification
9088 - Dash normalization
91-
89+
9290 Args:
9391 text: Input text
94-
92+
9593 Returns:
9694 Normalized text
9795 """
@@ -102,9 +100,21 @@ def normalize_text(text: str) -> str:
102100
103101 # Whitespace normalization
104102 space_chars = [
105- "\xa0 " , "\u2000 " , "\u2001 " , "\u2002 " , "\u2003 " , "\u2004 " ,
106- "\u2005 " , "\u2006 " , "\u2007 " , "\u2008 " , "\u2009 " , "\u200a " ,
107- "\u202f " , "\u205f " , "\u3000 " ,
103+ "\xa0 " ,
104+ "\u2000 " ,
105+ "\u2001 " ,
106+ "\u2002 " ,
107+ "\u2003 " ,
108+ "\u2004 " ,
109+ "\u2005 " ,
110+ "\u2006 " ,
111+ "\u2007 " ,
112+ "\u2008 " ,
113+ "\u2009 " ,
114+ "\u200a " ,
115+ "\u202f " ,
116+ "\u205f " ,
117+ "\u3000 " ,
108118 ]
109119 for space in space_chars :
110120 text = text .replace (space , " " )
@@ -114,7 +124,7 @@ def normalize_text(text: str) -> str:
114124 for quote in single_quotes :
115125 text = text .replace (quote , "'" )
116126
117- double_quotes = ['"' , '"' , "„" , "‟" , "«" , "»" , "〝" , "〞" , "〟" , """ ]
127+ double_quotes = ['"' , " \u201c " , " \u201d " , "„" , "‟" , "«" , "»" , "〝" , "〞" , "〟" , """ ]
118128 for quote in double_quotes :
119129 text = text .replace (quote , '"' )
120130
@@ -132,23 +142,30 @@ def normalize_text(text: str) -> str:
def extract_numbers(text: str) -> set:
    """
    Extract all numeric values from text.

    Both integers ("42") and decimals ("3.14") are recognised. For every
    decimal, its integer part is added as well, so "3.14" contributes both
    "3.14" and "3".

    Args:
        text: Input text

    Returns:
        Set of numeric strings found (includes both decimals and their
        integer parts)
    """
    numbers = set()
    # The fractional part is matched only when digits follow the dot, so a
    # sentence-ending period ("costs 5.") is not glued onto the number.
    # Group 1 always holds the integer part of the match.
    for match in re.finditer(r"(\d+)(?:\.\d+)?", text):
        numbers.add(match.group(0))
        numbers.add(match.group(1))
    return numbers
143160
144161
145162def strip_to_alphanumeric (text : str ) -> str :
146163 """
147164 Strip text to alphanumeric characters only.
148-
165+
149166 Args:
150167 text: Input text
151-
168+
152169 Returns:
153170 Lowercase text with only alphanumeric chars and spaces
154171 """
0 commit comments