@@ -17,8 +17,6 @@ export class DocParser implements FileParser {
1717 throw new Error ( `File not found: ${ filePath } ` )
1818 }
1919
20- logger . info ( `Parsing DOC file: ${ filePath } ` )
21-
2220 const buffer = await readFile ( filePath )
2321 return this . parseBuffer ( buffer )
2422 } catch ( error ) {
@@ -29,53 +27,80 @@ export class DocParser implements FileParser {
2927
3028 async parseBuffer ( buffer : Buffer ) : Promise < FileParseResult > {
3129 try {
32- logger . info ( 'Parsing DOC buffer, size:' , buffer . length )
33-
3430 if ( ! buffer || buffer . length === 0 ) {
3531 throw new Error ( 'Empty buffer provided' )
3632 }
3733
38- let parseOfficeAsync
3934 try {
4035 const officeParser = await import ( 'officeparser' )
41- parseOfficeAsync = officeParser . parseOfficeAsync
42- } catch ( importError ) {
43- logger . warn ( 'officeparser not available, using fallback extraction' )
44- return this . fallbackExtraction ( buffer )
36+ const result = await officeParser . parseOfficeAsync ( buffer )
37+
38+ if ( result ) {
39+ const resultString = typeof result === 'string' ? result : String ( result )
40+ const content = sanitizeTextForUTF8 ( resultString . trim ( ) )
41+
42+ if ( content . length > 0 ) {
43+ return {
44+ content,
45+ metadata : {
46+ characterCount : content . length ,
47+ extractionMethod : 'officeparser' ,
48+ } ,
49+ }
50+ }
51+ }
52+ } catch ( officeError ) {
53+ logger . warn ( 'officeparser failed, trying mammoth:' , officeError )
4554 }
4655
4756 try {
48- const result = await parseOfficeAsync ( buffer )
49-
50- if ( ! result ) {
51- throw new Error ( 'officeparser returned no result' )
57+ const mammoth = await import ( 'mammoth' )
58+ const result = await mammoth . extractRawText ( { buffer } )
59+
60+ if ( result . value && result . value . trim ( ) . length > 0 ) {
61+ const content = sanitizeTextForUTF8 ( result . value . trim ( ) )
62+ return {
63+ content,
64+ metadata : {
65+ characterCount : content . length ,
66+ extractionMethod : 'mammoth' ,
67+ messages : result . messages ,
68+ } ,
69+ }
5270 }
53-
54- const resultString = typeof result === 'string' ? result : String ( result )
55-
56- const content = sanitizeTextForUTF8 ( resultString . trim ( ) )
57-
58- logger . info ( 'DOC parsing completed successfully with officeparser' )
59-
60- return {
61- content : content ,
62- metadata : {
63- characterCount : content . length ,
64- extractionMethod : 'officeparser' ,
65- } ,
66- }
67- } catch ( extractError ) {
68- logger . warn ( 'officeparser failed, using fallback:' , extractError )
69- return this . fallbackExtraction ( buffer )
71+ } catch ( mammothError ) {
72+ logger . warn ( 'mammoth failed:' , mammothError )
7073 }
74+
75+ return this . fallbackExtraction ( buffer )
7176 } catch ( error ) {
72- logger . error ( 'DOC buffer parsing error:' , error )
77+ logger . error ( 'DOC parsing error:' , error )
7378 throw new Error ( `Failed to parse DOC buffer: ${ ( error as Error ) . message } ` )
7479 }
7580 }
7681
7782 private fallbackExtraction ( buffer : Buffer ) : FileParseResult {
78- logger . info ( 'Using fallback text extraction for DOC file' )
83+ const isBinaryDoc = buffer . length >= 2 && buffer [ 0 ] === 0xd0 && buffer [ 1 ] === 0xcf
84+
85+ if ( ! isBinaryDoc ) {
86+ const textContent = buffer . toString ( 'utf8' ) . trim ( )
87+
88+ if ( textContent . length > 0 ) {
89+ const printableChars = textContent . match ( / [ \x20 - \x7E \n \r \t ] / g) ?. length || 0
90+ const isProbablyText = printableChars / textContent . length > 0.9
91+
92+ if ( isProbablyText ) {
93+ return {
94+ content : sanitizeTextForUTF8 ( textContent ) ,
95+ metadata : {
96+ extractionMethod : 'plaintext-fallback' ,
97+ characterCount : textContent . length ,
98+ warning : 'File is not a valid DOC format, extracted as plain text' ,
99+ } ,
100+ }
101+ }
102+ }
103+ }
79104
80105 const text = buffer . toString ( 'utf8' , 0 , Math . min ( buffer . length , 100000 ) )
81106
0 commit comments