3636import org .exist .xquery .XPathException ;
3737import org .exist .xquery .XQueryContext ;
3838import org .exist .xquery .value .Base64BinaryValueType ;
39- import org .exist .xquery .value .BinaryValueFromInputStream ;
39+ import org .exist .xquery .value .BinaryValueFromBinaryString ;
4040import org .exist .xquery .value .FunctionParameterSequenceType ;
4141import org .exist .xquery .value .FunctionReturnSequenceType ;
4242import org .exist .xquery .value .Sequence ;
@@ -88,6 +88,24 @@ public class FileIO extends BasicFunction {
8888 new SequenceType []{FILE_PARAM , ENCODING_PARAM },
8989 new FunctionReturnSequenceType (Type .STRING , Cardinality .ZERO_OR_MORE , "the lines of the file." )
9090 ),
91+ // file:read-text($file as xs:string, $encoding as xs:string?, $fallback as xs:boolean) as xs:string
92+ new FunctionSignature (
93+ new QName ("read-text" , ExpathFileModule .NAMESPACE_URI , ExpathFileModule .PREFIX ),
94+ "Reads the contents of a file as text. If $fallback is true, invalid characters are replaced." ,
95+ new SequenceType []{FILE_PARAM , ENCODING_PARAM ,
96+ new FunctionParameterSequenceType ("fallback" , Type .BOOLEAN , Cardinality .EXACTLY_ONE ,
97+ "If true, replace invalid characters with the Unicode replacement character." )},
98+ new FunctionReturnSequenceType (Type .STRING , Cardinality .EXACTLY_ONE , "the file contents as string." )
99+ ),
100+ // file:read-text-lines($file as xs:string, $encoding as xs:string?, $fallback as xs:boolean) as xs:string*
101+ new FunctionSignature (
102+ new QName ("read-text-lines" , ExpathFileModule .NAMESPACE_URI , ExpathFileModule .PREFIX ),
103+ "Reads the contents of a file as a sequence of lines. If $fallback is true, invalid characters are replaced." ,
104+ new SequenceType []{FILE_PARAM , ENCODING_PARAM ,
105+ new FunctionParameterSequenceType ("fallback" , Type .BOOLEAN , Cardinality .EXACTLY_ONE ,
106+ "If true, replace invalid characters with the Unicode replacement character." )},
107+ new FunctionReturnSequenceType (Type .STRING , Cardinality .ZERO_OR_MORE , "the lines of the file." )
108+ ),
91109 // file:read-binary($file as xs:string) as xs:base64Binary
92110 new FunctionSignature (
93111 new QName ("read-binary" , ExpathFileModule .NAMESPACE_URI , ExpathFileModule .PREFIX ),
@@ -130,7 +148,7 @@ public Sequence eval(final Sequence[] args, final Sequence contextSequence) thro
130148 ExpathFileModuleHelper .checkDbaRole (context , this );
131149
132150 final String pathStr = args [0 ].getStringValue ();
133- final Path path = ExpathFileModuleHelper .getPath (pathStr , this );
151+ final Path path = ExpathFileModuleHelper .getPath (pathStr , this , context );
134152
135153 if (!Files .exists (path )) {
136154 throw new XPathException (this , ExpathFileErrorCode .NOT_FOUND ,
@@ -154,20 +172,27 @@ public Sequence eval(final Sequence[] args, final Sequence contextSequence) thro
154172
155173 private Sequence readText (final Path path , final Sequence [] args ) throws XPathException {
156174 final Charset encoding = getEncoding (args , 1 );
175+ final boolean fallback = args .length > 2 && !args [2 ].isEmpty ()
176+ && args [2 ].itemAt (0 ).toJavaObject (Boolean .class );
157177 try {
158- final String content = Files . readString (path , encoding );
178+ final String content = readFileText (path , encoding , fallback );
159179 // Normalize newlines per spec: CR or CRLF -> LF
160180 final String normalized = content .replace ("\r \n " , "\n " ).replace ("\r " , "\n " );
161181 return new StringValue (this , normalized );
182+ } catch (final java .nio .charset .MalformedInputException e ) {
183+ throw new XPathException (this , ExpathFileErrorCode .IO_ERROR ,
184+ "Invalid characters in file for encoding " + encoding .name ());
162185 } catch (final IOException e ) {
163186 throw new XPathException (this , ExpathFileErrorCode .IO_ERROR , e .getMessage ());
164187 }
165188 }
166189
167190 private Sequence readTextLines (final Path path , final Sequence [] args ) throws XPathException {
168191 final Charset encoding = getEncoding (args , 1 );
192+ final boolean fallback = args .length > 2 && !args [2 ].isEmpty ()
193+ && args [2 ].itemAt (0 ).toJavaObject (Boolean .class );
169194 try {
170- final String content = Files . readString (path , encoding );
195+ final String content = readFileText (path , encoding , fallback );
171196 // Split at newline boundaries per spec
172197 final String [] lines = content .split ("\r \n |\r |\n " , -1 );
173198 final ValueSequence result = new ValueSequence (lines .length );
@@ -177,48 +202,116 @@ private Sequence readTextLines(final Path path, final Sequence[] args) throws XP
177202 result .add (new StringValue (this , lines [i ]));
178203 }
179204 return result ;
205+ } catch (final java .nio .charset .MalformedInputException e ) {
206+ throw new XPathException (this , ExpathFileErrorCode .IO_ERROR ,
207+ "Invalid characters in file for encoding " + encoding .name ());
180208 } catch (final IOException e ) {
181209 throw new XPathException (this , ExpathFileErrorCode .IO_ERROR , e .getMessage ());
182210 }
183211 }
184212
185213 private Sequence readBinary (final Path path , final Sequence [] args ) throws XPathException {
186214 final long offset = args .length > 1 && !args [1 ].isEmpty () ? args [1 ].itemAt (0 ).toJavaObject (Long .class ) : 0 ;
187- final long length = args .length > 2 && !args [2 ].isEmpty () ? args [2 ].itemAt (0 ).toJavaObject (Long .class ) : -1 ;
215+ final boolean hasLength = args .length > 2 && !args [2 ].isEmpty ();
216+ final long length = hasLength ? args [2 ].itemAt (0 ).toJavaObject (Long .class ) : -1 ;
188217
189218 try {
190219 final long fileSize = Files .size (path );
191- if (offset < 0 || offset > fileSize ) {
192- throw new XPathException (this , ExpathFileErrorCode .OUT_OF_RANGE ,
193- "Offset " + offset + " is out of range for file of size " + fileSize );
194- }
195- if (length < -1 ) {
196- throw new XPathException (this , ExpathFileErrorCode .OUT_OF_RANGE ,
197- "Length must not be negative: " + length );
198- }
199- if (length >= 0 && offset + length > fileSize ) {
200- throw new XPathException (this , ExpathFileErrorCode .OUT_OF_RANGE ,
201- "Offset + length exceeds file size: " + (offset + length ) + " > " + fileSize );
202- }
220+ validateBinaryRange (offset , length , hasLength , fileSize );
221+
222+ final byte [] data = readBinaryData (path , offset , hasLength , length , fileSize );
223+ final String base64 = java .util .Base64 .getEncoder ().encodeToString (data );
224+ return new BinaryValueFromBinaryString (this , new Base64BinaryValueType (), base64 );
225+ } catch (final IOException e ) {
226+ throw new XPathException (this , ExpathFileErrorCode .IO_ERROR , e .getMessage ());
227+ }
228+ }
203229
204- if (offset == 0 && length < 0 ) {
205- // Read entire file
206- final InputStream is = Files .newInputStream (path );
207- return BinaryValueFromInputStream .getInstance (context , new Base64BinaryValueType (), is , this );
230+ private void validateBinaryRange (final long offset , final long length , final boolean hasLength , final long fileSize ) throws XPathException {
231+ if (offset < 0 || offset > fileSize ) {
232+ throw new XPathException (this , ExpathFileErrorCode .OUT_OF_RANGE ,
233+ "Offset " + offset + " is out of range for file of size " + fileSize );
234+ }
235+ if (hasLength && length < 0 ) {
236+ throw new XPathException (this , ExpathFileErrorCode .OUT_OF_RANGE ,
237+ "Length must not be negative: " + length );
238+ }
239+ if (hasLength && offset + length > fileSize ) {
240+ throw new XPathException (this , ExpathFileErrorCode .OUT_OF_RANGE ,
241+ "Offset + length exceeds file size: " + (offset + length ) + " > " + fileSize );
242+ }
243+ }
244+
245+ private byte [] readBinaryData (final Path path , final long offset , final boolean hasLength , final long length , final long fileSize ) throws IOException {
246+ if (offset == 0 && !hasLength ) {
247+ return Files .readAllBytes (path );
248+ }
249+ try (final RandomAccessFile raf = new RandomAccessFile (path .toFile (), "r" )) {
250+ raf .seek (offset );
251+ final int readLen = hasLength ? (int ) length : (int ) (fileSize - offset );
252+ final byte [] data = new byte [readLen ];
253+ raf .readFully (data );
254+ return data ;
255+ }
256+ }
257+
258+ /**
259+ * Reads a file as text with the given encoding.
260+ * If fallback is true, malformed byte sequences and XML-illegal characters
261+ * are replaced with U+FFFD. Otherwise, an IOException is thrown if the file
262+ * contains malformed bytes or XML-illegal characters.
263+ */
264+ private String readFileText (final Path path , final Charset encoding , final boolean fallback ) throws IOException {
265+ final String content ;
266+ if (fallback ) {
267+ final java .nio .charset .CharsetDecoder decoder = encoding .newDecoder ()
268+ .onMalformedInput (java .nio .charset .CodingErrorAction .REPLACE )
269+ .onUnmappableCharacter (java .nio .charset .CodingErrorAction .REPLACE )
270+ .replaceWith ("\uFFFD " );
271+ final byte [] bytes = Files .readAllBytes (path );
272+ content = decoder .decode (java .nio .ByteBuffer .wrap (bytes )).toString ();
273+ // Replace XML-illegal characters with U+FFFD
274+ return replaceXmlIllegalChars (content );
275+ } else {
276+ content = Files .readString (path , encoding );
277+ // Check for XML-illegal characters
278+ checkXmlIllegalChars (content );
279+ return content ;
280+ }
281+ }
282+
283+ /**
284+ * Check if a string contains characters illegal in XML 1.0 and throw IOException if so.
285+ * XML 1.0 allows: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
286+ */
287+ private void checkXmlIllegalChars (final String text ) throws IOException {
288+ for (int i = 0 ; i < text .length (); i ++) {
289+ final char c = text .charAt (i );
290+ if (c < 0x20 && c != 0x9 && c != 0xA && c != 0xD ) {
291+ throw new IOException ("File contains XML-illegal character U+" +
292+ String .format ("%04X" , (int ) c ) + " at position " + i );
293+ }
294+ if (c >= 0xFFFE ) {
295+ throw new IOException ("File contains XML-illegal character U+" +
296+ String .format ("%04X" , (int ) c ) + " at position " + i );
208297 }
298+ }
299+ }
209300
210- // Partial read
211- try (final RandomAccessFile raf = new RandomAccessFile (path .toFile (), "r" )) {
212- raf .seek (offset );
213- final int readLen = length >= 0 ? (int ) length : (int ) (fileSize - offset );
214- final byte [] data = new byte [readLen ];
215- raf .readFully (data );
216- final InputStream bis = new java .io .ByteArrayInputStream (data );
217- return BinaryValueFromInputStream .getInstance (context , new Base64BinaryValueType (), bis , this );
301+ /**
302+ * Replace characters illegal in XML 1.0 with U+FFFD.
303+ */
304+ private String replaceXmlIllegalChars (final String text ) {
305+ final StringBuilder sb = new StringBuilder (text .length ());
306+ for (int i = 0 ; i < text .length (); i ++) {
307+ final char c = text .charAt (i );
308+ if ((c < 0x20 && c != 0x9 && c != 0xA && c != 0xD ) || c >= 0xFFFE ) {
309+ sb .append ('\uFFFD' );
310+ } else {
311+ sb .append (c );
218312 }
219- } catch (final IOException e ) {
220- throw new XPathException (this , ExpathFileErrorCode .IO_ERROR , e .getMessage ());
221313 }
314+ return sb .toString ();
222315 }
223316
224317 private Charset getEncoding (final Sequence [] args , final int index ) throws XPathException {
0 commit comments