@@ -104,7 +104,7 @@ public static void SkipIfLegacyOfficeDisabled(string relativePath)
104104
105105 public static void SkipIfOfficeTestOnWindows ( string relativePath )
106106 {
107- // Office tests timeout on Windows due to LibreOffice conversion delays
107+ // Office tests timeout on Windows
108108 if ( OperatingSystem . IsWindows ( ) )
109109 {
110110 var ext = Path . GetExtension ( relativePath ) . ToLowerInvariant ( ) ;
@@ -116,6 +116,15 @@ public static void SkipIfOfficeTestOnWindows(string relativePath)
116116 }
117117 }
118118
/// <summary>
/// Skips the calling test unless PaddleOCR has been flagged as available.
/// </summary>
/// <remarks>
/// Availability is read from the <c>KREUZBERG_PADDLE_OCR_AVAILABLE</c> environment variable.
/// An unset or blank value, <c>0</c>, or <c>false</c> (any casing) is treated as "unavailable".
/// </remarks>
/// <exception cref="Xunit.SkipException">
/// Thrown when the environment variable marks PaddleOCR as unavailable.
/// </exception>
public static void SkipIfPaddleOcrUnavailable()
{
    const string variableName = "KREUZBERG_PADDLE_OCR_AVAILABLE";

    // Trim first so that values such as " 0 " or "false " are still recognised as "off".
    var flag = Environment.GetEnvironmentVariable(variableName)?.Trim();
    if (string.IsNullOrEmpty(flag)
        || flag == "0"
        || flag.Equals("false", StringComparison.OrdinalIgnoreCase))
    {
        // Always give a reason so the test report explains why the test did not run.
        throw new Xunit.SkipException($"PaddleOCR is not available ({variableName} is unset, 0 or false)");
    }
}
127+
119128 public static ExtractionConfig ? BuildConfig ( string ? configJson )
120129 {
121130 if ( string . IsNullOrWhiteSpace ( configJson ) )
@@ -562,145 +571,95 @@ public static void AssertElements(
/// <summary>
/// Verifies the OCR elements attached to an extraction result.
/// </summary>
/// <param name="result">Extraction result whose <c>OcrElements</c> collection is inspected.</param>
/// <param name="hasElements">When <c>true</c>, at least one OCR element must be present.</param>
/// <param name="hasGeometry">When <c>true</c>, every OCR element must have a non-null geometry.</param>
/// <param name="hasConfidence">When <c>true</c>, every OCR element must have a non-null confidence.</param>
/// <param name="minCount">When set, the minimum number of OCR elements required.</param>
/// <exception cref="XunitException">Thrown when any requested expectation is not met.</exception>
public static void AssertOcrElements(
    ExtractionResult result,
    bool? hasElements,
    bool? hasGeometry,
    bool? hasConfidence,
    int? minCount)
{
    var ocrElements = result.OcrElements;
    // A missing collection is treated as "zero elements" so count-based checks still apply.
    var elementCount = ocrElements?.Count ?? 0;

    if (hasElements == true && elementCount == 0)
    {
        throw new XunitException("Expected OCR elements but none found");
    }

    if (ocrElements is not null)
    {
        if (hasGeometry == true)
        {
            for (var i = 0; i < ocrElements.Count; i++)
            {
                if (ocrElements[i].Geometry is null)
                {
                    throw new XunitException($"OCR element {i} expected to have geometry");
                }
            }
        }

        if (hasConfidence == true)
        {
            for (var i = 0; i < ocrElements.Count; i++)
            {
                if (ocrElements[i].Confidence is null)
                {
                    throw new XunitException($"OCR element {i} expected to have confidence score");
                }
            }
        }
    }

    // Checked outside the null guard: a null collection must not satisfy a positive minimum.
    if (minCount.HasValue && elementCount < minCount.Value)
    {
        throw new XunitException($"Expected at least {minCount.Value} OCR elements, found {elementCount}");
    }
}
620614
/// <summary>
/// Verifies the structured document attached to an extraction result.
/// </summary>
/// <param name="result">Extraction result whose <c>Document</c> is inspected.</param>
/// <param name="hasDocument">
/// When <c>true</c>, a document with a non-null node list is required; when <c>false</c>,
/// the document must be null and the remaining parameters are ignored.
/// </param>
/// <param name="minNodeCount">When set, the minimum number of document nodes required.</param>
/// <param name="nodeTypesInclude">
/// When set, node types that must each appear at least once (compared case-insensitively).
/// </param>
/// <param name="hasGroups">
/// When set, whether a node of type <c>group</c> must (<c>true</c>) or must not (<c>false</c>) exist.
/// </param>
/// <exception cref="XunitException">Thrown when any requested expectation is not met.</exception>
public static void AssertDocument(
    ExtractionResult result,
    bool hasDocument = false,
    int? minNodeCount = null,
    IEnumerable<string>? nodeTypesInclude = null,
    bool? hasGroups = null)
{
    var document = result.Document;
    if (!hasDocument)
    {
        if (document is not null)
        {
            throw new XunitException("Expected document to be null but got a document");
        }
        return;
    }

    if (document is null)
    {
        throw new XunitException("Expected document but got null");
    }

    var nodes = document.Nodes;
    if (nodes is null)
    {
        throw new XunitException("Expected document nodes but got null");
    }

    if (minNodeCount.HasValue && nodes.Count < minNodeCount.Value)
    {
        throw new XunitException($"Expected at least {minNodeCount.Value} nodes, found {nodes.Count}");
    }

    if (nodeTypesInclude is null && !hasGroups.HasValue)
    {
        return;
    }

    // Collect the distinct node types once. The comparer makes lookups case-insensitive,
    // and nodes without a type are skipped so they cannot show up as an empty entry.
    var foundTypes = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    foreach (var node in nodes)
    {
        var nodeType = node.Content?.NodeType;
        if (!string.IsNullOrEmpty(nodeType))
        {
            foundTypes.Add(nodeType);
        }
    }

    if (nodeTypesInclude is not null)
    {
        foreach (var expected in nodeTypesInclude)
        {
            if (!foundTypes.Contains(expected))
            {
                throw new XunitException($"Expected node type '{expected}' not found in [{string.Join(", ", foundTypes)}]");
            }
        }
    }

    if (hasGroups.HasValue)
    {
        var hasGroupNodes = foundTypes.Contains("group");
        if (hasGroupNodes != hasGroups.Value)
        {
            throw new XunitException($"Expected hasGroups={hasGroups.Value} but got {hasGroupNodes}");
        }
    }
}
0 commit comments