Skip to content

Commit 37a647a

Browse files
committed
feat: add length unit support in FileSystem limits
Different filesystems and operating systems measure file and path lengths in different units:

* macOS and Windows filesystems typically count **UTF-16 code units**.
* Linux and other UNIX filesystems typically count **bytes**.

This change introduces explicit unit support so these limits can be interpreted consistently.

### Key changes

* **New API**
  * Added a `LengthUnit` enum and `FileSystem.getLengthUnit()` to expose the unit of measure used by `getMaxFileNameLength()` and `getMaxPathLength()`.
  * Added new overloads for `isLegalFileName` and `toLegalFileName` that accept a `Charset`, making conversions between bytes and UTF-16 explicit.
* **Adjusted defaults**
  * Reduced the `GENERIC` filesystem defaults:
    * File name length → **1020 bytes** (covers 255 UTF-16 characters encoded as up to 3 UTF-8 bytes).
    * Path length → **1 MiB** (covers 32,767 UTF-16 code units, again at 3 UTF-8 bytes each).
* **Testing**
  * Added unit tests to validate the new API and updated limits.
1 parent e205fb9 commit 37a647a

File tree

3 files changed

+339
-34
lines changed

3 files changed

+339
-34
lines changed

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ The <action> type attribute can be add,update,fix,remove.
5757
<action dev="ggregory" type="add" due-to="Gary Gregory">Add org.apache.commons.io.output.ProxyOutputStream.writeRepeat(byte[], int, int, long).</action>
5858
<action dev="ggregory" type="add" due-to="Gary Gregory">Add org.apache.commons.io.output.ProxyOutputStream.writeRepeat(byte[], long).</action>
5959
<action dev="ggregory" type="add" due-to="Gary Gregory">Add org.apache.commons.io.output.ProxyOutputStream.writeRepeat(int, long).</action>
60+
<action dev="pkarwasz" type="add" due-to="Piotr P. Karwasz">Add length unit support in FileSystem limits.</action>
6061
<!-- UPDATE -->
6162
<action type="update" dev="ggregory" due-to="Gary Gregory, Dependabot">Bump org.apache.commons:commons-parent from 85 to 87 #774.</action>
6263
<action type="update" dev="ggregory" due-to="Gary Gregory">[test] Bump commons-codec:commons-codec from 1.18.0 to 1.19.0.</action>

src/main/java/org/apache/commons/io/FileSystem.java

Lines changed: 186 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,13 @@
1717

1818
package org.apache.commons.io;
1919

20+
import java.nio.ByteBuffer;
21+
import java.nio.CharBuffer;
22+
import java.nio.charset.CharacterCodingException;
23+
import java.nio.charset.Charset;
24+
import java.nio.charset.CharsetEncoder;
25+
import java.nio.charset.CoderResult;
26+
import java.nio.charset.CodingErrorAction;
2027
import java.util.Arrays;
2128
import java.util.Locale;
2229
import java.util.Objects;
@@ -36,7 +43,12 @@ public enum FileSystem {
3643
/**
3744
* Generic file system.
3845
*/
39-
GENERIC(4096, false, false, Integer.MAX_VALUE, Integer.MAX_VALUE, new int[] { 0 }, new String[] {}, false, false, '/'),
46+
GENERIC(4096, false, false, 1020, 1024 * 1024, new int[] {
47+
// @formatter:off
48+
// ASCII NUL
49+
0
50+
// @formatter:on
51+
}, new String[] {}, false, false, '/', LengthUnit.BYTES),
4052

4153
/**
4254
* Linux file system.
@@ -48,7 +60,7 @@ public enum FileSystem {
4860
0,
4961
'/'
5062
// @formatter:on
51-
}, new String[] {}, false, false, '/'),
63+
}, new String[] {}, false, false, '/', LengthUnit.BYTES),
5264

5365
/**
5466
* MacOS file system.
@@ -61,7 +73,7 @@ public enum FileSystem {
6173
'/',
6274
':'
6375
// @formatter:on
64-
}, new String[] {}, false, false, '/'),
76+
}, new String[] {}, false, false, '/', LengthUnit.CHARS),
6577

6678
/**
6779
* Windows file system.
@@ -78,7 +90,7 @@ public enum FileSystem {
7890
*/
7991
// @formatter:off
8092
WINDOWS(4096, false, true,
81-
255, 32000, // KEEP THIS ARRAY SORTED!
93+
255, 32767, // KEEP THIS ARRAY SORTED!
8294
new int[] {
8395
// KEEP THIS ARRAY SORTED!
8496
// ASCII NUL
@@ -95,7 +107,7 @@ public enum FileSystem {
95107
"LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
96108
"LPT\u00b2", "LPT\u00b3", "LPT\u00b9", // Superscript 2 3 1 in that order
97109
"NUL", "PRN"
98-
}, true, true, '\\');
110+
}, true, true, '\\', LengthUnit.CHARS);
99111
// @formatter:on
100112

101113
/**
@@ -315,6 +327,7 @@ private static String replace(final String path, final char oldChar, final char
315327
private final boolean supportsDriveLetter;
316328
private final char nameSeparator;
317329
private final char nameSeparatorOther;
330+
private final LengthUnit lengthUnit;
318331

319332
/**
320333
* Constructs a new instance.
@@ -329,10 +342,12 @@ private static String replace(final String path, final char oldChar, final char
329342
* @param reservedFileNamesExtensions TODO
330343
* @param supportsDriveLetter Whether this file system support driver letters.
331344
* @param nameSeparator The name separator, '\\' on Windows, '/' on Linux.
345+
* @param lengthUnit The unit of measurement for length limits.
332346
*/
333347
FileSystem(final int blockSize, final boolean caseSensitive, final boolean casePreserving,
334348
final int maxFileLength, final int maxPathLength, final int[] illegalFileNameChars,
335-
final String[] reservedFileNames, final boolean reservedFileNamesExtensions, final boolean supportsDriveLetter, final char nameSeparator) {
349+
final String[] reservedFileNames, final boolean reservedFileNamesExtensions, final boolean supportsDriveLetter,
350+
final char nameSeparator, final LengthUnit lengthUnit) {
336351
this.blockSize = blockSize;
337352
this.maxFileNameLength = maxFileLength;
338353
this.maxPathLength = maxPathLength;
@@ -345,6 +360,7 @@ private static String replace(final String path, final char oldChar, final char
345360
this.supportsDriveLetter = supportsDriveLetter;
346361
this.nameSeparator = nameSeparator;
347362
this.nameSeparatorOther = FilenameUtils.flipSeparator(nameSeparator);
363+
this.lengthUnit = lengthUnit;
348364
}
349365

350366
/**
@@ -380,23 +396,45 @@ public int[] getIllegalFileNameCodePoints() {
380396
}
381397

382398
/**
383-
* Gets the maximum length for file names. The file name does not include folders.
399+
* Gets the maximum length for file names (excluding any folder path).
400+
*
401+
* <p><strong>Note:</strong> This excludes any folder path. The unit depends on the
402+
* filesystem or OS; see {@link #getLengthUnit()} to check whether the value is in
403+
* bytes or UTF-16 characters.</p>
384404
*
385-
* @return the maximum length for file names.
405+
* @return the maximum file name length.
386406
*/
387407
public int getMaxFileNameLength() {
388408
return maxFileNameLength;
389409
}
390410

391411
/**
392-
* Gets the maximum length of the path to a file. This can include folders.
412+
* Gets the maximum length for file paths (may include folders).
393413
*
394-
* @return the maximum length of the path to a file.
414+
* <p><strong>Note:</strong> This may include folder names as well as the file name.
415+
* The unit is the same as {@link #getMaxFileNameLength()} and can be obtained
416+
* from {@link #getLengthUnit()}.</p>
417+
*
418+
* @return the maximum file path length.
395419
*/
396420
public int getMaxPathLength() {
397421
return maxPathLength;
398422
}
399423

424+
/**
425+
* Gets the unit in which {@link #getMaxFileNameLength()} and {@link #getMaxPathLength()} are expressed.
426+
*
427+
* <p>Depending on the platform, limits may be expressed in bytes or in UTF-16
428+
* code units ({@code char} values).</p>
429+
*
430+
* @return the unit for file name and path length limits.
431+
* @since 2.21.0
432+
*/
433+
public LengthUnit getLengthUnit() {
434+
return lengthUnit;
435+
}
436+
437+
400438
/**
401439
* Gets the name separator, '\\' on Windows, '/' on Linux.
402440
*
@@ -446,16 +484,42 @@ private boolean isIllegalFileNameChar(final int c) {
446484
}
447485

448486
/**
449-
* Tests if a candidate file name (without a path) such as {@code "filename.ext"} or {@code "filename"} is a
450-
* potentially legal file name. If the file name length exceeds {@link #getMaxFileNameLength()}, or if it contains
451-
* an illegal character then the check fails.
487+
* Tests if a candidate file name (without a path) is a legal file name.
488+
*
489+
* <p>Takes a file name like {@code "filename.ext"} or {@code "filename"} and checks:</p>
490+
* <ul>
491+
* <li>if the file name length is legal</li>
492+
* <li>if the file name is not a reserved file name</li>
493+
* <li>if the file name does not contain illegal characters</li>
494+
* </ul>
452495
*
453496
* @param candidate
454-
* a candidate file name (without a path) like {@code "filename.ext"} or {@code "filename"}
497+
* A candidate file name (without a path) like {@code "filename.ext"} or {@code "filename"}
455498
* @return {@code true} if the candidate name is legal
456499
*/
457500
public boolean isLegalFileName(final CharSequence candidate) {
458-
if (candidate == null || candidate.length() == 0 || candidate.length() > maxFileNameLength) {
501+
return isLegalFileName(candidate, Charset.defaultCharset());
502+
}
503+
504+
/**
505+
* Tests if a candidate file name (without a path) is a legal file name.
506+
*
507+
* <p>Takes a file name like {@code "filename.ext"} or {@code "filename"} and checks:</p>
508+
* <ul>
509+
* <li>if the file name length is legal</li>
510+
* <li>if the file name is not a reserved file name</li>
511+
* <li>if the file name does not contain illegal characters</li>
512+
* </ul>
513+
*
514+
* @param candidate
515+
* A candidate file name (without a path) like {@code "filename.ext"} or {@code "filename"}
516+
* @param charset
517+
* The charset to use when the file name length is measured in bytes
518+
* @return {@code true} if the candidate name is legal
519+
* @since 2.21.0
520+
*/
521+
public boolean isLegalFileName(final CharSequence candidate, final Charset charset) {
522+
if (!isLegalFileLength(candidate, charset)) {
459523
return false;
460524
}
461525
if (isReservedFileName(candidate)) {
@@ -504,24 +568,53 @@ public boolean supportsDriveLetter() {
504568
}
505569

506570
/**
507-
* Converts a candidate file name (without a path) like {@code "filename.ext"} or {@code "filename"} to a legal file
508-
* name. Illegal characters in the candidate name are replaced by the {@code replacement} character. If the file
509-
* name length exceeds {@link #getMaxFileNameLength()}, then the name is truncated to
510-
* {@link #getMaxFileNameLength()}.
571+
* Converts a candidate file name (without a path) to a legal file name.
572+
*
573+
* <p>Takes a file name like {@code "filename.ext"} or {@code "filename"} and:</p>
574+
* <ul>
575+
* <li>replaces illegal characters by the given replacement character</li>
576+
* <li>truncates the name to {@link #getMaxFileNameLength()} if necessary</li>
577+
* </ul>
511578
*
512579
* @param candidate
513-
* a candidate file name (without a path) like {@code "filename.ext"} or {@code "filename"}
580+
* A candidate file name (without a path) like {@code "filename.ext"} or {@code "filename"}
514581
* @param replacement
515582
* Illegal characters in the candidate name are replaced by this character
516583
* @return a String without illegal characters
517584
*/
518585
public String toLegalFileName(final String candidate, final char replacement) {
586+
return toLegalFileName(candidate, replacement, Charset.defaultCharset());
587+
}
588+
589+
/**
590+
* Converts a candidate file name (without a path) to a legal file name.
591+
*
592+
* <p>Takes a file name like {@code "filename.ext"} or {@code "filename"} and:</p>
593+
* <ul>
594+
* <li>replaces illegal characters by the given replacement character</li>
595+
* <li>truncates the name to {@link #getMaxFileNameLength()} if necessary</li>
596+
* </ul>
597+
*
598+
* @param candidate
599+
* A candidate file name (without a path) like {@code "filename.ext"} or {@code "filename"}
600+
* @param replacement
601+
* Illegal characters in the candidate name are replaced by this character
602+
* @param charset
603+
* The charset to use when the file name length is measured in bytes
604+
* @return a String without illegal characters
605+
* @since 2.21.0
606+
*/
607+
public String toLegalFileName(final String candidate, final char replacement, final Charset charset) {
608+
Objects.requireNonNull(candidate, "candidate");
609+
if (candidate.isEmpty()) {
610+
throw new IllegalArgumentException("The candidate file name is empty");
611+
}
519612
if (isIllegalFileNameChar(replacement)) {
520613
// %s does not work properly with NUL
521614
throw new IllegalArgumentException(String.format("The replacement character '%s' cannot be one of the %s illegal characters: %s",
522615
replacement == '\0' ? "\\0" : replacement, name(), Arrays.toString(illegalFileNameChars)));
523616
}
524-
final String truncated = candidate.length() > maxFileNameLength ? candidate.substring(0, maxFileNameLength) : candidate;
617+
final CharSequence truncated = truncateFileName(candidate, charset);
525618
final int[] array = truncated.chars().map(i -> isIllegalFileNameChar(i) ? replacement : i).toArray();
526619
return new String(array, 0, array.length);
527620
}
@@ -530,4 +623,76 @@ CharSequence trimExtension(final CharSequence cs) {
530623
final int index = indexOf(cs, '.', 0);
531624
return index < 0 ? cs : cs.subSequence(0, index);
532625
}
626+
627+
private boolean isLegalFileLength(final CharSequence candidate, final Charset charset) {
628+
if (candidate == null || candidate.length() == 0) {
629+
return false;
630+
}
631+
if (lengthUnit == LengthUnit.CHARS) {
632+
return candidate.length() <= getMaxFileNameLength();
633+
}
634+
final CharsetEncoder encoder = charset.newEncoder();
635+
try {
636+
final ByteBuffer buffer = encoder.encode(CharBuffer.wrap(candidate));
637+
return buffer.remaining() <= getMaxFileNameLength();
638+
} catch (CharacterCodingException e) {
639+
// If we can't encode, it's not legal
640+
return false;
641+
}
642+
}
643+
644+
CharSequence truncateFileName(final CharSequence candidate, final Charset charset) {
645+
final int maxFileNameLength = getMaxFileNameLength();
646+
// Character-based limit: simple substring if needed.
647+
if (lengthUnit == LengthUnit.CHARS) {
648+
return candidate.length() <= maxFileNameLength ? candidate : candidate.subSequence(0, maxFileNameLength);
649+
}
650+
651+
// Byte-based limit
652+
return truncateByBytes(candidate, charset, maxFileNameLength);
653+
}
654+
655+
static CharSequence truncateByBytes(final CharSequence candidate, final Charset charset, final int maxBytes) {
656+
// Byte-based limit
657+
final CharsetEncoder encoder = charset.newEncoder()
658+
.onMalformedInput(CodingErrorAction.REPORT)
659+
.onUnmappableCharacter(CodingErrorAction.REPORT);
660+
661+
if (!encoder.canEncode(candidate)) {
662+
throw new IllegalArgumentException(
663+
"File name contains characters that cannot be encoded with charset " + charset.name());
664+
}
665+
666+
// Fast path: if even the worst-case expansion fits, we're done.
667+
if (candidate.length() <= Math.floor(maxBytes / encoder.maxBytesPerChar())) {
668+
return candidate;
669+
}
670+
671+
// Slow path: encode into a fixed-size byte buffer.
672+
final ByteBuffer out = ByteBuffer.allocate(maxBytes);
673+
final CharBuffer in = CharBuffer.wrap(candidate);
674+
675+
// Encode until the first character that would exceed the byte budget.
676+
final CoderResult cr = encoder.encode(in, out, true);
677+
678+
if (cr.isUnderflow()) {
679+
// Entire candidate fit within maxFileNameLength bytes.
680+
return candidate;
681+
}
682+
683+
// We ran out of space mid-encode: truncate BEFORE the offending character.
684+
return candidate.subSequence(0, in.position());
685+
}
686+
687+
/**
688+
* Units of length in which the file name and path length limits of a file system are expressed.
689+
*
690+
* @since 2.21.0
691+
*/
692+
public enum LengthUnit {
693+
/** Length in bytes, counted after encoding the name with a charset. */
694+
BYTES,
695+
/** Length in UTF-16 code units ({@code char} values); a supplementary character counts as two. */
696+
CHARS;
697+
}
533698
}

0 commit comments

Comments
 (0)