|
58 | 58 | modelbusy = threading.Lock() |
59 | 59 | requestsinqueue = 0 |
60 | 60 | defaultport = 5001 |
61 | | -KcppVersion = "1.80.3" |
| 61 | +KcppVersion = "1.81" |
62 | 62 | showdebug = True |
63 | 63 | guimode = False |
64 | 64 | showsamplerwarning = True |
@@ -1310,6 +1310,11 @@ def fetch_webpages_parallel(urls): |
1310 | 1310 | results = list(executor.map(fetch_searched_webpage, urls)) |
1311 | 1311 | return results |
1312 | 1312 |
|
def normalize_page_text(text):
    """Normalize punctuation spacing in scraped web-page text.

    Two cleanups are applied:
      1. Whitespace (including newlines) immediately before ``. , ! ?`` is
         removed ("word ." -> "word.").
      2. A single space is inserted after ``. , ! ?`` when the next character
         is neither whitespace nor a digit ("end.Next" -> "end. Next").

    The digit exclusion in step 2 keeps decimal numbers and thousands
    separators intact ("3.14", "1,000"), which the previous pattern split
    into "3. 14" / "1, 000".

    NOTE(review): dotted tokens such as domain names ("example.com") are
    still split; telling those apart from sentence boundaries would need
    more context than a regex provides.

    Args:
        text: Raw visible text extracted from a web page.

    Returns:
        The text with punctuation spacing normalized.
    """
    # Strip any whitespace run that precedes sentence punctuation.
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    # Ensure one space follows punctuation, unless whitespace or a digit
    # (part of a number) comes next.
    text = re.sub(r'([.,!?])([^\s\d])', r'\1 \2', text)
    return text
| 1317 | + |
1313 | 1318 | class VisibleTextParser(HTMLParser): |
1314 | 1319 | def __init__(self): |
1315 | 1320 | super().__init__() |
@@ -1393,12 +1398,14 @@ def handle_data(self, data): |
1393 | 1398 | parser2 = VisibleTextParser() |
1394 | 1399 | parser2.feed(html_content) |
1395 | 1400 | scraped = parser2.get_text().strip() |
| 1401 | + scraped = normalize_page_text(scraped) |
| 1402 | + desc = normalize_page_text(desc) |
1396 | 1403 | s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower(), autojunk=False) |
1397 | 1404 | matches = s.find_longest_match(0, len(scraped), 0, desclen) |
1398 | 1405 | if matches.size > 100 and desclen-matches.size < 100: #good enough match |
1399 | 1406 | # expand description by some chars both sides |
1400 | | - expandamtbefore = 250 |
1401 | | - expandamtafter = 750 |
| 1407 | + expandamtbefore = 200 |
| 1408 | + expandamtafter = 800 |
1402 | 1409 | startpt = matches.a - expandamtbefore |
1403 | 1410 | startpt = 0 if startpt < 0 else startpt |
1404 | 1411 | endpt = matches.a + expandamtafter + desclen |
|
0 commit comments