Skip to content

Commit cd5aa22

Browse files
Updated FileUtil to include Metadata hints when calling Tika, to help Tika identify a file's content type.
1 parent 70d2310 commit cd5aa22

File tree

5 files changed

+107
-19
lines changed

5 files changed

+107
-19
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
* commons-beanutils:commons-beanutils:1.11.0 to org.apache.commons:commons-beanutils2:2.0.0-M2
1717
* Updated antisamy to latest version 1.7.8 as it has reinstated the xHTML behaviour for tags. Versions 1.7.0 to 1.7.6 did not support xHTML and would break the XML.
1818
* org.owasp.antisamy:antisamy from 1.6.8 to 1.7.8
19+
* Updated FileUtil to include Metadata hints when calling Tika, to help Tika identify a file's content type.
1920

2021
### Bug Fixes
2122

wcomponents-core/src/main/java/com/github/bordertech/wcomponents/util/FileUtil.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import org.apache.commons.logging.Log;
1111
import org.apache.commons.logging.LogFactory;
1212
import org.apache.tika.Tika;
13+
import org.apache.tika.metadata.Metadata;
14+
import org.apache.tika.metadata.TikaCoreProperties;
1315

1416
/**
1517
* Utility methods for {@link File}.
@@ -98,7 +100,15 @@ public static String getFileMimeType(final File file) {
98100
if (file != null) {
99101
try {
100102
final Tika tika = new Tika();
101-
return tika.detect(file.getInputStream());
103+
// Set up metadata hints to help Tika detect the MIME type
104+
Metadata meta = new Metadata();
105+
if (file.getName() != null) {
106+
meta.set(TikaCoreProperties.RESOURCE_NAME_KEY, file.getName());
107+
}
108+
if (file.getMimeType() != null) {
109+
meta.set(TikaCoreProperties.CONTENT_TYPE_HINT, file.getMimeType());
110+
}
111+
return tika.detect(file.getInputStream(), meta);
102112
} catch (IOException ex) {
103113
LOG.error("Invalid file, name " + file.getName(), ex);
104114
}

wcomponents-core/src/test/java/com/github/bordertech/wcomponents/util/FileUtil_Test.java

Lines changed: 93 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package com.github.bordertech.wcomponents.util;
22

3+
import com.github.bordertech.wcomponents.file.File;
34
import com.github.bordertech.wcomponents.file.FileItemWrap;
45
import com.github.bordertech.wcomponents.util.mock.MockFileItem;
56
import java.io.IOException;
@@ -12,6 +13,7 @@
1213

1314
/**
1415
* FileUtil_Test - unit test for {@link FileUtil}.
16+
*
1517
* @author Aswin Kandula
1618
* @since 1.5
1719
*/
@@ -32,7 +34,7 @@ public void testValidateFileTypeImageFile() throws IOException {
3234
public void testValidateFileTypeAnyFile() throws IOException {
3335
boolean validateFileType = FileUtil.validateFileType(null, null);
3436
Assert.assertFalse(validateFileType);
35-
37+
3638
FileItem newFileItem = createFileItem(null);
3739
validateFileType = FileUtil.validateFileType(new FileItemWrap(newFileItem), Collections.EMPTY_LIST);
3840
Assert.assertTrue(validateFileType);
@@ -44,35 +46,35 @@ public void testValidateFileTypePdfFile() throws IOException {
4446
boolean validateFileType = FileUtil.validateFileType(new FileItemWrap(newFileItem), Arrays.asList("application/pdf"));
4547
Assert.assertTrue(validateFileType);
4648
}
47-
49+
4850
@Test
4951
public void testValidateFileTypeTr5File() throws IOException {
5052
// 'tr5' file has no mime type, so validation will pass with extension only.
5153
FileItem newFileItem = createFileItem("/content/test.tr5");
5254
boolean validateFileType = FileUtil.validateFileType(new FileItemWrap(newFileItem), Arrays.asList(".tr5"));
5355
Assert.assertTrue(validateFileType);
54-
56+
5557
newFileItem = createFileItem("/content/test.tr5");
5658
validateFileType = FileUtil.validateFileType(new FileItemWrap(newFileItem), Arrays.asList("text/plain"));
5759
Assert.assertTrue(validateFileType);
58-
60+
5961
newFileItem = createFileItem("/content/test.tr5");
6062
validateFileType = FileUtil.validateFileType(new FileItemWrap(newFileItem), Arrays.asList("image/jpg"));
6163
Assert.assertFalse(validateFileType);
6264
}
63-
65+
6466
@Test
6567
public void testValidateFileTypeDodgyTr5File() throws IOException {
6668
// 'tr5' file has no mime type, so validation will pass with extension only.
6769
FileItem newFileItem = createFileItem("/content/dodgy.pdf.tr5");
6870
boolean validateFileType = FileUtil.validateFileType(new FileItemWrap(newFileItem), Arrays.asList(".tr5"));
6971
Assert.assertTrue(validateFileType);
70-
72+
7173
newFileItem = createFileItem("/content/dodgy.pdf.tr5");
7274
validateFileType = FileUtil.validateFileType(new FileItemWrap(newFileItem), Arrays.asList("text/plain"));
7375
Assert.assertFalse(validateFileType);
7476
}
75-
77+
7678
@Test
7779
public void testValidateFileSize() throws IOException {
7880
FileItem newFileItem = createFileItem(null);
@@ -81,20 +83,20 @@ public void testValidateFileSize() throws IOException {
8183

8284
validateFileSize = FileUtil.validateFileSize(new FileItemWrap(newFileItem), 50);
8385
Assert.assertFalse(validateFileSize);
84-
86+
8587
FileUtil.validateFileSize(null, 0);
8688
Assert.assertFalse(validateFileSize);
87-
89+
8890
validateFileSize = FileUtil.validateFileSize(new FileItemWrap(newFileItem), -1000);
8991
Assert.assertTrue(validateFileSize);
9092
}
91-
93+
9294
@Test
9395
public void testReadableFileSize() {
9496
String readableFileSize = FileUtil.readableFileSize(10101);
9597
Assert.assertEquals("10.1 KB", readableFileSize);
9698
}
97-
99+
98100
@Test
99101
public void testGetInvalidFileTypeMessage() {
100102
String invalidFileTypeMessage = FileUtil.getInvalidFileTypeMessage(null);
@@ -103,16 +105,40 @@ public void testGetInvalidFileTypeMessage() {
103105
invalidFileTypeMessage = FileUtil.getInvalidFileTypeMessage(Arrays.asList("*"));
104106
Assert.assertEquals("The file you have selected is not of an accepted type. Only the following type/s are accepted: *.", invalidFileTypeMessage);
105107
}
106-
108+
107109
@Test
108110
public void testGetInvalidFileSizeMessage() {
109111
String invalidFileSizeMessage = FileUtil.getInvalidFileSizeMessage(1111);
110112
Assert.assertEquals("The file you have selected is too large. Maximum file size is 1.1 KB.", invalidFileSizeMessage);
111113
}
112-
114+
115+
@Test
116+
public void testGetMimeTypeForTextFileAndNoHint() throws IOException {
117+
// By default, the Tika text detector treats a file with less than 10% non-ASCII characters as a text file
118+
// Test Tika detects the file as text with no hint
119+
MyMockFile file = new MyMockFile("/content/text-non-ascii-less-10-per.txt", null, null);
120+
Assert.assertEquals("Incorrect type for text file that should have been detected as text with no hints", "text/plain", FileUtil.getFileMimeType(file));
121+
}
122+
123+
@Test
124+
public void testGetMimeTypeForTextFileWithAsciiAndNoHint() throws IOException {
125+
// By default, the Tika text detector treats a file with more than 10% non-ASCII characters as not a text file
126+
// Test that, when no file name hint is provided, Tika won't detect it as text
127+
MyMockFile file = new MyMockFile("/content/text-non-ascii-more-10-per.txt", null, null);
128+
Assert.assertEquals("Incorrect type for text file that should not be detected as text with no hints", "application/octet-stream", FileUtil.getFileMimeType(file));
129+
}
130+
131+
@Test
132+
public void testGetMimeTypeForTextFileWithAsciiAndNameHint() throws IOException {
133+
// By default, the Tika text detector treats a file with more than 10% non-ASCII characters as not a text file
134+
// Test that providing a file name hint lets Tika detect it as text
135+
MyMockFile file = new MyMockFile("/content/text-non-ascii-more-10-per.txt", "text-non-ascii-more-10-per.txt", null);
136+
Assert.assertEquals("Incorrect type for text file that should be detected as text with name hint", "text/plain", FileUtil.getFileMimeType(file));
137+
}
138+
113139
/**
114140
* Create a new fileitem.
115-
*
141+
*
116142
* @param fileResource if {@code null} dummy byte[] are set on file, otherwise given file resource.
117143
* @return a file item
118144
*/
@@ -123,21 +149,70 @@ private FileItem createFileItem(String fileResource) throws IOException {
123149
for (int i = 0; i < testFileContent.length; i++) {
124150
testFileContent[i] = (byte) (i & 0xff);
125151
}
126-
}
127-
else {
152+
} else {
128153
InputStream stream = getClass().getResourceAsStream(fileResource);
129154
if (stream == null) {
130155
throw new IOException("File resource not found: " + fileResource);
131156
}
132-
testFileContent = StreamUtil.getBytes(stream);
157+
testFileContent = StreamUtil.getBytes(stream);
133158
}
134159
MockFileItem fileItem = new MockFileItem();
135160
fileItem.set(testFileContent);
136161
fileItem.setFieldName(fileResource);
137-
if (fileResource != null) {
162+
if (fileResource != null) {
138163
String[] tokens = fileResource.split(".+?/(?=[^/]+$)");
139164
fileItem.setName(tokens[1]);
140165
}
141166
return fileItem;
142167
}
168+
169+
private static class MyMockFile implements File {
170+
171+
private final String fileResource;
172+
private final String name;
173+
private final String mimeType;
174+
175+
public MyMockFile(String fileResource, String name, String mimeType) {
176+
this.fileResource = fileResource;
177+
this.name = name;
178+
this.mimeType = mimeType;
179+
}
180+
181+
@Override
182+
public InputStream getInputStream() throws IOException {
183+
return getClass().getResourceAsStream(fileResource);
184+
}
185+
186+
@Override
187+
public String getName() {
188+
return name;
189+
}
190+
191+
@Override
192+
public String getMimeType() {
193+
return mimeType;
194+
}
195+
196+
@Override
197+
public long getSize() {
198+
throw new UnsupportedOperationException("Not used in test");
199+
}
200+
201+
@Override
202+
public String getFileName() {
203+
throw new UnsupportedOperationException("Not used in test");
204+
}
205+
206+
@Override
207+
public byte[] getBytes() {
208+
throw new UnsupportedOperationException("Not used in test");
209+
}
210+
211+
@Override
212+
public String getDescription() {
213+
throw new UnsupportedOperationException("Not used in test");
214+
}
215+
216+
}
217+
143218
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
��XXXXXXXXXXXXXXXXXX
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
�����XX

0 commit comments

Comments
 (0)