|
9 | 9 | # Citation detection patterns |
10 | 10 | PATTERNS: Dict[str, Pattern] = { |
11 | 11 | # Cases: Party v. Party, Volume Reporter Page (Court Year) |
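| 12 | + # Fixed over-matching: a party name is now a run of capitalized words (plus an optional corporate suffix) instead of one character class containing \s; note that \s+ between words can still match a newline |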
12 | 13 | "case_complete": re.compile( |
13 | | - r"([A-Z][a-zA-Z\.\'\-\s]+)\s+v\.\s+([A-Z][a-zA-Z\.\'\-\s]+),\s*" |
14 | | - r"(\d+)\s+([A-Z][a-zA-Z\.\s\d]+)\s+(\d+)" |
| 14 | + r"([A-Z][a-zA-Z\.\'\-]+(?:\s+[A-Z][a-zA-Z\.\'\-]+)*(?:,?\s+(?:LLC|Inc\.|Corp\.|Co\.|Ltd\.))?)\s+v\.\s+" |
| 15 | + r"([A-Z][a-zA-Z\.\'\-]+(?:\s+[A-Z][a-zA-Z\.\'\-]+)*(?:,?\s+(?:LLC|Inc\.|Corp\.|Co\.|Ltd\.))?),\s*" |
| 16 | + r"(\d+)\s+([A-Z][a-zA-Z\.\s\d]+?)\s+(\d+)" |
15 | 17 | r"(?:,\s*(\d+(?:-\d+)?))?\s*" |
16 | 18 | r"\(([^)]+)\)" |
17 | 19 | ), |
18 | | - |
| 20 | + |
19 | 21 | # Incomplete case: just Party v. Party (missing reporter info) |
| 22 | + # Fixed over-matching: whitespace is no longer part of the name character class, and every word of a party name must start with a capital letter |
20 | 23 | "case_incomplete": re.compile( |
21 | | - r"([A-Z][a-zA-Z\.\'\-\s]+)\s+v\.\s+([A-Z][a-zA-Z\.\'\-\s]+)" |
| 24 | + r"([A-Z][a-zA-Z\.\'\-]+(?:\s+[A-Z][a-zA-Z\.\'\-]+)*)\s+v\.\s+" |
| 25 | + r"([A-Z][a-zA-Z\.\'\-]+(?:\s+[A-Z][a-zA-Z\.\'\-]+)*)" |
22 | 26 | r"(?!\s*,\s*\d+\s+[A-Z])" |
23 | 27 | ), |
24 | 28 |
|
|
48 | 52 | ), |
49 | 53 |
|
50 | 54 | # Books: Author, Title (Edition Year) |
| 55 | + # Fixed to capture full edition like "6th" not just "6" |
51 | 56 | "book": re.compile( |
52 | 57 | r"([A-Z][a-zA-Z\.\s]+),\s+" |
53 | 58 | r"([A-Z][^(]+)\s*" |
54 | | - r"\((?:(\d+)(?:st|nd|rd|th)\s+ed\.\s+)?(\d{4})\)" |
| 59 | + r"\((?:(\d+(?:st|nd|rd|th))\s+ed\.\s+)?(\d{4})\)" |
55 | 60 | ), |
56 | 61 |
|
57 | 62 | # Short forms - Id. |
@@ -452,18 +457,46 @@ def get_journal_abbreviation(journal: str) -> str: |
452 | 457 | """Get the Bluebook abbreviation for a journal.""" |
453 | 458 | return JOURNAL_ABBREVIATIONS.get(journal, journal) |
454 | 459 |
|
455 | | -def abbreviate_party_name(party: str) -> str: |
456 | | - """Abbreviate a party name per Bluebook Table 6.""" |
| 460 | +def abbreviate_party_name(party: str, is_state_party: bool = False) -> str: |
| 461 | + """ |
| 462 | + Abbreviate a party name per Bluebook Table 6. |
| 463 | +
|
| 464 | + Args: |
| 465 | + party: The party name to abbreviate |
| 466 | + is_state_party: If True, skip all abbreviation and return the name unchanged apart from removing a leading "The" and surrounding whitespace |
| 467 | + (per Rule 10.2.1 - states as parties keep full names) |
| 468 | + """ |
457 | 469 | result = party |
458 | | - |
| 470 | + |
459 | 471 | # Remove "The" at beginning (Rule 10.2.1(e)) |
460 | 472 | if result.lower().startswith("the "): |
461 | 473 | result = result[4:] |
462 | | - |
463 | | - # Apply abbreviations |
| 474 | + |
| 475 | + # Don't abbreviate if it's a state/geographic entity as a party |
| 476 | + # (e.g., "North Carolina" as defendant should stay "North Carolina") |
| 477 | + if is_state_party: |
| 478 | + return result.strip() |
| 479 | + |
| 480 | + # Check if this is a pure state name - if so, don't abbreviate |
| 481 | + if result.strip() in STATE_ABBREVIATIONS: |
| 482 | + return result.strip() |
| 483 | + |
| 484 | + # Words that should NOT be abbreviated in party names |
| 485 | + # (only abbreviate organizational/business terms) |
| 486 | + skip_abbreviations = { |
| 487 | + "North", "South", "East", "West", "Eastern", "Western", |
| 488 | + "Northern", "Southern", "Northeast", "Northwest", |
| 489 | + "Southeast", "Southwest", "Carolina", "Dakota", "Virginia", |
| 490 | + "Hampshire", "Jersey", "Mexico", "York" |
| 491 | + } |
| 492 | + |
| 493 | + # Apply abbreviations only for business/organizational terms |
464 | 494 | for full, abbrev in PARTY_ABBREVIATIONS.items(): |
| 495 | + # Skip geographic terms in case names |
| 496 | + if full in skip_abbreviations: |
| 497 | + continue |
465 | 498 | # Use word boundaries for replacement |
466 | 499 | pattern = re.compile(r'\b' + re.escape(full) + r'\b', re.IGNORECASE) |
467 | 500 | result = pattern.sub(abbrev, result) |
468 | | - |
| 501 | + |
469 | 502 | return result.strip() |
0 commit comments