Skip to content

Commit feecb6d

Browse files
Juli0q authored and sschuberth committed
feat(model): Detect and exclude binary license files
Detect and exclude binary license files using Apache Tika. When a non-text file is found during the license info creation process, a warning is printed, and it is excluded from the final report. This prevents the inclusion of binary files that previously caused the reporter to enter an endless loop during report generation. Signed-off-by: Julian Olderdissen <[email protected]>
1 parent 8076f0d commit feecb6d

File tree

4 files changed

+37
-2
lines changed

4 files changed

+37
-2
lines changed

gradle/libs.versions.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ slf4j = "2.0.17"
6767
springCore = "6.2.7"
6868
svnkit = "1.10.12"
6969
sw360Client = "17.0.1-m2"
70+
tika = "3.1.0"
7071
wagonHttp = "3.5.3"
7172
wiremock = "3.13.0"
7273
xmlutil = "0.91.1"
@@ -186,6 +187,7 @@ slf4j = { module = "org.slf4j:slf4j-api", version.ref = "slf4j" }
186187
springCore = { module = "org.springframework:spring-core", version.ref = "springCore" }
187188
svnkit = { module = "com.tmatesoft.svnkit:svnkit", version.ref = "svnkit" }
188189
sw360Client = { module = "org.eclipse.sw360:client", version.ref = "sw360Client" }
190+
tika = { module = "org.apache.tika:tika-core", version.ref = "tika" }
189191
wagon-http = { module = "org.apache.maven.wagon:wagon-http", version.ref = "wagonHttp" }
190192
wiremock = { module = "org.wiremock:wiremock", version.ref = "wiremock" }
191193
xz = { module = "org.tukaani:xz", version.ref = "xz" }

model/build.gradle.kts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ dependencies {
4242
implementation(libs.jackson.module.kotlin)
4343
implementation(libs.postgres)
4444
implementation(libs.semver4j)
45+
implementation(libs.tika)
4546

4647
testFixturesImplementation(projects.utils.testUtils)
4748

model/src/main/kotlin/utils/FileArchiver.kt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ import kotlin.time.measureTime
2525
import kotlin.time.measureTimedValue
2626

2727
import org.apache.logging.log4j.kotlin.logger
28+
import org.apache.tika.Tika
29+
import org.apache.tika.mime.MimeTypes
2830

2931
import org.ossreviewtoolkit.model.KnownProvenance
3032
import org.ossreviewtoolkit.utils.common.FileMatcher
@@ -72,11 +74,17 @@ class FileArchiver(
7274
logger.info { "Archiving files matching ${matcher.patterns} from '$directory'..." }
7375

7476
val zipFile = createOrtTempFile(suffix = ".zip")
77+
val tika = Tika()
7578

7679
val zipDuration = measureTime {
7780
directory.packZip(zipFile, overwrite = true) { file ->
7881
val relativePath = file.relativeTo(directory).invariantSeparatorsPath
7982

83+
if (tika.detect(file) != MimeTypes.PLAIN_TEXT) {
84+
logger.warn { "Not adding file '$relativePath' to archive because it is not a text file." }
85+
return@packZip false
86+
}
87+
8088
matcher.matches(relativePath).also { result ->
8189
logger.debug {
8290
if (result) {

model/src/test/kotlin/utils/FileArchiverTest.kt

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@ import io.kotest.core.spec.style.StringSpec
2323
import io.kotest.core.test.TestCase
2424
import io.kotest.engine.spec.tempdir
2525
import io.kotest.matchers.file.aFile
26+
import io.kotest.matchers.file.containFile
2627
import io.kotest.matchers.file.exist
2728
import io.kotest.matchers.file.shouldContainNFiles
29+
import io.kotest.matchers.should
2830
import io.kotest.matchers.shouldBe
2931
import io.kotest.matchers.shouldNot
3032

@@ -60,10 +62,10 @@ class FileArchiverTest : StringSpec() {
6062
storage = FileProvenanceFileStorage(LocalFileStorage(storageDir), FileArchiverConfiguration.ARCHIVE_FILENAME)
6163
}
6264

63-
private fun createFile(path: String) {
65+
private fun createFile(path: String, write: File.() -> Unit = { writeText(path) }) {
6466
val file = workingDir.resolve(path)
6567
file.parentFile.safeMkdirs()
66-
file.writeText(path)
68+
file.write()
6769
}
6870

6971
/**
@@ -162,5 +164,27 @@ class FileArchiverTest : StringSpec() {
162164
archiver.unarchive(targetDir, PROVENANCE) shouldBe true
163165
targetDir shouldContainNFiles 0
164166
}
167+
168+
// Verifies that a file whose name looks like a license but whose content is not text is left out of the archive.
"exclude basic binary license file" {
169+
// 0xFF 0xD8 are the leading bytes of a JPEG stream, i.e. the file content is not plain text.
createFile("License") { writeBytes(byteArrayOf(0xFF.toByte(), 0xD8.toByte())) }
170+
171+
val archiver = FileArchiver.createDefault()
172+
archiver.archive(workingDir, PROVENANCE)
173+
val result = archiver.unarchive(targetDir, PROVENANCE)
174+
175+
// Unarchiving is still expected to report success; only the binary file must be missing from the target.
result shouldBe true
176+
targetDir shouldNot containFile("License")
177+
}
178+
179+
// Verifies that a text file consisting solely of non-ASCII characters is still archived as a text file.
"include utf8 file with japanese chars" {
180+
// File.writeText() encodes as UTF-8 by default, so the file holds only multi-byte hiragana characters.
createFile("License") { writeText("ぁあぃいぅうぇえぉおかが") }
181+
182+
val archiver = FileArchiver.createDefault()
183+
archiver.archive(workingDir, PROVENANCE)
184+
val result = archiver.unarchive(targetDir, PROVENANCE)
185+
186+
// The file must survive the archive / unarchive round trip.
result shouldBe true
187+
targetDir should containFile("License")
188+
}
165189
}
166190
}

0 commit comments

Comments
 (0)