|
9 | 9 | # Citation detection patterns |
10 | 10 | PATTERNS: Dict[str, Pattern] = { |
11 | 11 | # Cases: Party v. Party, Volume Reporter Page (Court Year) |
| 12 | + # Party names: capitalized first word plus up to 5 more words (any case, e.g. LLC/Inc/Corp), no greedy whitespace |
12 | 13 | "case_complete": re.compile( |
13 | | - r"([A-Z][a-zA-Z\.\'\-\s]+)\s+v\.\s+([A-Z][a-zA-Z\.\'\-\s]+),\s*" |
14 | | - r"(\d+)\s+([A-Z][a-zA-Z\.\s\d]+)\s+(\d+)" |
| 14 | + r"([A-Z][a-zA-Z\.\'\-]+(?:\s+[A-Za-z\.\'\-]+){0,5})\s+v\.\s+" |
| 15 | + r"([A-Z][a-zA-Z\.\'\-]+(?:\s+[A-Za-z\.\'\-]+){0,5}),\s*" |
| 16 | + r"(\d+)\s+([A-Z][a-zA-Z\.\s\d]+?)\s+(\d+)" |
15 | 17 | r"(?:,\s*(\d+(?:-\d+)?))?\s*" |
16 | 18 | r"\(([^)]+)\)" |
17 | 19 | ), |
18 | | - |
| 20 | + |
19 | 21 | # Incomplete case: just Party v. Party (missing reporter info) |
| 22 | + # Limited to reasonable party name length (first word plus up to 5 more, i.e. max 6 words per party) |
20 | 23 | "case_incomplete": re.compile( |
21 | | - r"([A-Z][a-zA-Z\.\'\-\s]+)\s+v\.\s+([A-Z][a-zA-Z\.\'\-\s]+)" |
| 24 | + r"([A-Z][a-zA-Z\.\'\-]+(?:\s+[A-Za-z\.\'\-]+){0,5})\s+v\.\s+" |
| 25 | + r"([A-Z][a-zA-Z\.\'\-]+(?:\s+[A-Za-z\.\'\-]+){0,5})" |
22 | 26 | r"(?!\s*,\s*\d+\s+[A-Z])" |
23 | 27 | ), |
24 | 28 |
|
|
47 | 51 | r"\((\d{4})\)" |
48 | 52 | ), |
49 | 53 |
|
50 | | - # Books: Author, Title (Edition Year) |
| 54 | + # Books: Author, Title (Edition Year) - captures full ordinal like "6th" |
51 | 55 | "book": re.compile( |
52 | 56 | r"([A-Z][a-zA-Z\.\s]+),\s+" |
53 | 57 | r"([A-Z][^(]+)\s*" |
54 | | - r"\((?:(\d+)(?:st|nd|rd|th)\s+ed\.\s+)?(\d{4})\)" |
| 58 | + r"\((?:(\d+(?:st|nd|rd|th))\s+ed\.\s+)?(\d{4})\)" |
55 | 59 | ), |
56 | 60 |
|
57 | 61 | # Short forms - Id. |
|
0 commit comments