Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
81 commits
Select commit Hold shift + click to select a range
62c0383
Message text correction
fractal3000 Sep 6, 2024
56af748
Parser resolving mechanism refactoring.
fractal3000 Sep 7, 2024
23a2f66
UnsupportedFileExtensionExceptionTest
fractal3000 Sep 7, 2024
e5ed2ce
FileParserResolverTest
fractal3000 Sep 7, 2024
0b630ac
FileParserResolverTest
fractal3000 Sep 7, 2024
c51699d
File type correction
fractal3000 Sep 7, 2024
230e41a
UnsupportedFileExtensionExceptionTest
fractal3000 Sep 7, 2024
a8d720b
Groovy tests correction
fractal3000 Sep 7, 2024
5f81bf0
minor change
fractal3000 Sep 7, 2024
078543b
minor change
fractal3000 Sep 7, 2024
856366c
FilePropertyValueExtractorTest
fractal3000 Sep 7, 2024
939b560
FilePropertyValueExtractorTest enhancement
fractal3000 Sep 9, 2024
f5b0bf0
FilePropertyValueExtractorTest enhancement
fractal3000 Sep 9, 2024
659d3a3
dependencies adding
fractal3000 Sep 9, 2024
6864255
Review correction
fractal3000 Sep 10, 2024
7936b6b
Review correction(exceptions)
fractal3000 Sep 10, 2024
8b2b1bd
Review correction(exceptions)
fractal3000 Sep 10, 2024
dd94a3d
Review correction(exceptions)
fractal3000 Sep 10, 2024
927fd47
Review correction(exceptions)
fractal3000 Sep 10, 2024
dccb7fc
Review correction(exceptions)
fractal3000 Sep 10, 2024
b346752
Parser resolvers
fractal3000 Sep 12, 2024
5b39a4f
Parser resolvers
fractal3000 Sep 12, 2024
a64133d
FileParserResolverManager
fractal3000 Sep 12, 2024
b3444e3
FileParserResolverManager
fractal3000 Sep 12, 2024
5c6dc63
FileProcessorTest
fractal3000 Sep 13, 2024
16bc24d
UnsupportedFileExtensionException
fractal3000 Sep 13, 2024
89aa491
UnsupportedFileExtensionExceptionTest
fractal3000 Sep 13, 2024
7b3e26f
FilePropertyValueExtractorTest
fractal3000 Sep 13, 2024
664c9ce
OpenOfficeDocumentsParserResolver correction
fractal3000 Sep 13, 2024
a35004a
adding necessary dependency for testing purposes
fractal3000 Sep 13, 2024
2f96055
Resolvers correction
fractal3000 Sep 13, 2024
788f231
test correction
fractal3000 Sep 13, 2024
2d0f663
Removing not necessary lines
fractal3000 Sep 13, 2024
96f5f6d
Extensions problem
fractal3000 Sep 13, 2024
ed5865c
Packages reorganizing
fractal3000 Sep 13, 2024
7a15aa3
Message correction.
fractal3000 Sep 13, 2024
317c50c
EmptyFileExtensionException message extending
fractal3000 Sep 13, 2024
9f5d67c
Java doc
fractal3000 Sep 13, 2024
edf2b10
Java doc
fractal3000 Sep 13, 2024
f4d6dae
FileParserResolverManagerIntegrationTest creation and resolvers corre…
fractal3000 Sep 18, 2024
5fd0076
a not necessary extra dependency
fractal3000 Sep 18, 2024
1f1ef1b
Method renaming
fractal3000 Sep 18, 2024
784a353
FileParserResolver class's signature changing
fractal3000 Sep 18, 2024
6256935
UnsupportedFileTypeException correction
fractal3000 Sep 18, 2024
e793ca8
FilePropertyValueExtractorTest correction
fractal3000 Sep 18, 2024
945962a
FileProcessorTest correction
fractal3000 Sep 18, 2024
a71d103
The tests correction
fractal3000 Sep 18, 2024
85fed7c
FileParserResolverManager and the test correction
fractal3000 Sep 18, 2024
0054c3e
AbstractExtensionBasedFileParserResolverTest
fractal3000 Sep 18, 2024
7dd1ab4
AbstractExtensionBasedFileParserResolverTest
fractal3000 Sep 18, 2024
8c1c3b9
JavaDoc
fractal3000 Sep 18, 2024
d6b228e
JavaDoc
fractal3000 Sep 18, 2024
7b7b8d5
minor change
fractal3000 Sep 18, 2024
4b4582c
minor change
fractal3000 Sep 18, 2024
5f5fc33
minor change
fractal3000 Sep 18, 2024
2be1c41
minor change
fractal3000 Sep 18, 2024
1f374ab
FileParserResolverManagerIntegrationTest extending
fractal3000 Sep 18, 2024
2d891cd
minor change
fractal3000 Sep 18, 2024
67753ff
code formatting
fractal3000 Sep 19, 2024
4f0d053
Capital letters checking
fractal3000 Sep 26, 2024
545302b
Removing not necessary custom exception
fractal3000 Sep 26, 2024
c7475c7
Renaming the exception
fractal3000 Sep 26, 2024
d04cf63
Code style changes
fractal3000 Sep 26, 2024
11c0b80
JavaDocs correction
fractal3000 Sep 26, 2024
0bf2067
JavaDocs correction
fractal3000 Sep 26, 2024
8292c88
Test correction
fractal3000 Sep 26, 2024
33fcc94
List to Set changing
fractal3000 Sep 26, 2024
6b07864
FileParserResolverManager -> FileParserProvider
fractal3000 Sep 26, 2024
51ee3fd
Message text correction
fractal3000 Sep 26, 2024
e8016ff
OldMSOfficeDocumentsParserResolver > LegacyMSOfficeDocumentsParserRes…
fractal3000 Sep 26, 2024
b6bf51f
JavaDoc
fractal3000 Sep 26, 2024
6da446f
Getting FileParsingBundle with FileParserResolver
fractal3000 Sep 26, 2024
b744d63
Comment adding
fractal3000 Sep 26, 2024
1d056e6
Uppercase extensions' support
fractal3000 Sep 27, 2024
be96ef6
FileParsingBundle -> FileParserKit
fractal3000 Sep 27, 2024
11453d8
BodyContentHandler -> ContentHandler
fractal3000 Sep 27, 2024
59c6a88
MSOfficeDocumentsParserResolver
fractal3000 Sep 27, 2024
5319a28
A fixing of the wrong merging
fractal3000 Sep 18, 2025
b24d7d3
Not necessary util method
fractal3000 Sep 18, 2025
6f600d9
Tests correction
fractal3000 Sep 19, 2025
b563375
Откат некорректного переименования
fractal3000 Sep 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion jmix-search/search/search.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,9 @@ dependencies {
testImplementation 'org.junit.jupiter:junit-jupiter-engine'
testImplementation 'org.junit.jupiter:junit-jupiter-params'
testImplementation 'org.junit.vintage:junit-vintage-engine'
testImplementation 'org.spockframework:spock-core'
testImplementation 'org.mockito:mockito-core'
testImplementation "org.spockframework:spock-core"
testImplementation 'ch.qos.logback:logback-classic'
testRuntimeOnly 'org.slf4j:slf4j-simple'
testRuntimeOnly 'org.hsqldb:hsqldb'
testRuntimeOnly 'org.junit.platform:junit-platform-launcher'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,29 @@

package io.jmix.search.exception;

import org.apache.commons.io.FilenameUtils;
import java.util.List;

/**
* An exception that is thrown when a user adds a file of a type that is not supported,
* i.e. a file for which no known parser exists.
*/
public class UnsupportedFileFormatException extends Exception {

public static final String MESSAGE = "The file %s with the '%s' extension is not supported.";
private static final String MESSAGE = "The file %s can't be parsed. " +
"Only the following file parsing criteria are supported:\n -%s";

/**
* @param fileName the name of the file which type is not supported
* @param supportedExtensions the list of the criteria that are supported in the application
*/
public UnsupportedFileFormatException(String fileName, List<String> supportedExtensions) {
super(String.format(
MESSAGE,
fileName,
getSupportedExtensionsString(supportedExtensions)));
}

public UnsupportedFileFormatException(String fileName) {
super(String.format(MESSAGE, fileName, FilenameUtils.getExtension(fileName)));
protected static String getSupportedExtensionsString(List<String> supportedExtensions) {
return String.join("\n -", supportedExtensions);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Copyright 2024 Haulmont.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.jmix.search.index.fileparsing;

import com.google.common.base.Strings;
import io.jmix.core.FileRef;
import org.apache.commons.io.FilenameUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

import java.io.StringWriter;
import java.util.Set;
import java.util.function.Function;

/**
 * Base class for file parser resolvers that decide whether a file is supported
 * by looking at the extension of its name.
 */
public abstract class AbstractExtensionBasedFileParserResolver implements FileParserResolver {

    /**
     * Returns the extensions of the files that this resolver is able to handle.
     * The extension check is case-sensitive, so both the lowercase and the uppercase
     * variant of an extension have to be listed explicitly when both should be accepted,
     * e.g. ["xlsx", "XLSX", "docx", "DOCX"].
     *
     * @return collection of supported extensions
     */
    public abstract Set<String> getSupportedExtensions();

    /**
     * Builds a description consisting of the simple class name of this resolver
     * and the comma-separated list of its supported extensions.
     *
     * @return criteria description
     */
    @Override
    public String getCriteriaDescription() {
        String resolverName = this.getClass().getSimpleName();
        String extensions = getSupportedExtensionsString(getSupportedExtensions());
        return String.format(
                "File parser resolver: %s. Supported extensions: %s.",
                resolverName,
                extensions);
    }

    /**
     * Checks whether the extension of the file name held by the given reference
     * is one of the supported extensions. A file name without an extension is never supported.
     *
     * @param fileRef object with the file information
     * @return {@code true} if the file extension is contained in {@link #getSupportedExtensions()}
     */
    @Override
    public boolean supports(FileRef fileRef) {
        String extension = FilenameUtils.getExtension(fileRef.getFileName());
        // The emptiness check comes first so that the set is only queried with a real extension.
        return !Strings.isNullOrEmpty(extension) && getSupportedExtensions().contains(extension);
    }

    /**
     * Joins the given extensions into a single string, separated with ", ".
     *
     * @param supportedExtensions extensions to be joined
     * @return the joined extensions
     */
    protected String getSupportedExtensionsString(Set<String> supportedExtensions) {
        final String delimiter = ", ";
        return String.join(delimiter, supportedExtensions);
    }

    /**
     * Assembles a parser kit from the parser, the content handler generator,
     * the metadata and the parse context provided by this resolver.
     *
     * @return an instance of a file parser kit
     */
    @Override
    public FileParserKit getParserKit() {
        Parser parser = getParser();
        Function<StringWriter, ContentHandler> handlerGenerator = getContentHandlerGenerator();
        Metadata metadata = getMetadata();
        ParseContext parseContext = getParseContext();
        return new FileParserKit(parser, handlerGenerator, metadata, parseContext);
    }

    /**
     * Returns a parser for the supported file type.
     */
    protected abstract Parser getParser();

    /**
     * Returns a function that creates the ContentHandler used for the file parsing.
     * By default, a {@link BodyContentHandler} that wraps the given writer is created.
     */
    protected Function<StringWriter, ContentHandler> getContentHandlerGenerator() {
        return BodyContentHandler::new;
    }

    /**
     * Returns a Metadata object for the file parsing. By default, a new empty instance is created.
     */
    protected Metadata getMetadata() {
        return new Metadata();
    }

    /**
     * Returns a ParseContext object for the file parsing. By default, a new empty instance is created.
     */
    protected ParseContext getParseContext() {
        return new ParseContext();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Copyright 2024 Haulmont.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.jmix.search.index.fileparsing;

import jakarta.validation.constraints.NotNull;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;

import java.io.StringWriter;
import java.util.function.Function;

/**
* Holds the objects that are used together for parsing a file of a supported type.
*
* @param parser the parser for the file
* @param contentHandlerGenerator a function that creates a {@link ContentHandler} for the given {@link StringWriter}
* @param metadata the {@link Metadata} object for the file parsing
* @param parseContext the {@link ParseContext} object for the file parsing
*/
public record FileParserKit(
@NotNull Parser parser,
@NotNull Function<StringWriter, ContentHandler> contentHandlerGenerator,
@NotNull Metadata metadata,
@NotNull ParseContext parseContext) {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright 2024 Haulmont.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.jmix.search.index.fileparsing;

import io.jmix.core.FileRef;

/**
* Interface to be implemented for adding a custom file parser resolver
* or modifying the behavior of the existing file parser resolvers. It gives an ability to define the exact parser
* for the exact file types with a custom implementation of the file checking logic. These parsers are used to extract
* file content for sending it to the search server and indexing.
*/
public interface FileParserResolver {

/**
* Returns a description of the criteria that a file has to meet to be supported by this resolver.
* This text is used for generating the log message that is written
* when none of the resolvers supports the processing of the given file.
*
* @return criteria description
*/
String getCriteriaDescription();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we actually need this public API method?
Isn't it better to delegate this logic to the final consumer? The only purpose of this is to generate message like 'The file extension should be one of the following: ...'.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The consumer doesn't know anything about the file checking criteria that are implemented in the resolvers. The aim was to give to the user ability to get comprehensive information what is going wrong. If we remove this method we just could say that "A resolver(and parser) for the file couldn't be found".

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The message of the AbstractExtensionBasedFileParserResolver was corrected.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The method was left without changes as it was discussed.


/**
* Returns an object that bundles everything that is necessary for parsing a file of the supported type.
*
* @return an instance of a file parser kit
*/
FileParserKit getParserKit();

/**
* Checks whether the file described by the given fileRef is supported by this resolver.
*
* @param fileRef object with the file information
* @return {@code true} if the resolver supports the file, {@code false} otherwise
*/
boolean supports(FileRef fileRef);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*
* Copyright 2020 Haulmont.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

@NonNullApi
package io.jmix.search.index.fileparsing;

import org.springframework.lang.NonNullApi;
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright 2024 Haulmont.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.jmix.search.index.fileparsing.resolvers;

import io.jmix.search.index.fileparsing.AbstractExtensionBasedFileParserResolver;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;

import java.util.Set;

/**
 * Resolver that provides an {@link OfficeParser} for the files
 * with the "doc" and "xls" extensions (lowercase or uppercase).
 */
@Component("search_LegacyMSOfficeDocumentsParserResolver")
@Order(100)
public class LegacyMSOfficeDocumentsParserResolver extends AbstractExtensionBasedFileParserResolver {

    // Both letter cases are listed because the extension check of the base class is case-sensitive.
    private static final Set<String> SUPPORTED_EXTENSIONS = Set.of("doc", "xls", "DOC", "XLS");

    /**
     * @return the extensions handled by this resolver
     */
    @Override
    public Set<String> getSupportedExtensions() {
        return SUPPORTED_EXTENSIONS;
    }

    /**
     * @return a new {@link OfficeParser} instance
     */
    @Override
    public Parser getParser() {
        return new OfficeParser();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright 2024 Haulmont.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.jmix.search.index.fileparsing.resolvers;

import io.jmix.search.index.fileparsing.AbstractExtensionBasedFileParserResolver;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;

import java.util.Set;

/**
 * Resolver that provides an {@link OOXMLParser} for the files
 * with the "docx" and "xlsx" extensions (lowercase or uppercase).
 */
@Component("search_OfficeDocumentsParserResolver")
@Order(100)
public class MSOfficeDocumentsParserResolver extends AbstractExtensionBasedFileParserResolver {

    // Both letter cases are listed because the extension check of the base class is case-sensitive.
    private static final Set<String> SUPPORTED_EXTENSIONS = Set.of("docx", "xlsx", "DOCX", "XLSX");

    /**
     * @return the extensions handled by this resolver
     */
    @Override
    public Set<String> getSupportedExtensions() {
        return SUPPORTED_EXTENSIONS;
    }

    /**
     * @return a new {@link OOXMLParser} instance
     */
    @Override
    public Parser getParser() {
        return new OOXMLParser();
    }

    /**
     * Extends the parse context of the base class with an {@link OfficeParserConfig}
     * that switches off the inclusion of headers and footers.
     *
     * @return the configured parse context
     */
    @Override
    protected ParseContext getParseContext() {
        OfficeParserConfig config = new OfficeParserConfig();
        config.setIncludeHeadersAndFooters(false);

        ParseContext context = super.getParseContext();
        context.set(OfficeParserConfig.class, config);
        return context;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright 2024 Haulmont.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.jmix.search.index.fileparsing.resolvers;

import io.jmix.search.index.fileparsing.AbstractExtensionBasedFileParserResolver;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.odf.OpenDocumentParser;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;

import java.util.Set;

/**
 * Resolver that provides an {@link OpenDocumentParser} for the files
 * with the "odt" and "ods" extensions (lowercase or uppercase).
 */
@Component("search_OpenOfficeDocumentsParserResolver")
@Order(100)
public class OpenOfficeDocumentsParserResolver extends AbstractExtensionBasedFileParserResolver {

    // Both letter cases are listed because the extension check of the base class is case-sensitive.
    private static final Set<String> SUPPORTED_EXTENSIONS = Set.of("odt", "ods", "ODT", "ODS");

    /**
     * @return the extensions handled by this resolver
     */
    @Override
    public Set<String> getSupportedExtensions() {
        return SUPPORTED_EXTENSIONS;
    }

    /**
     * @return a new {@link OpenDocumentParser} instance
     */
    @Override
    public Parser getParser() {
        return new OpenDocumentParser();
    }
}
Loading