Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
00657f5
Improve FakeMatcher to make tests easier to read
seadowg Dec 18, 2024
0da81ec
Add naive implementation for pmatch
seadowg Dec 18, 2024
81789c9
Add support for multiple templates per subject
seadowg Dec 18, 2024
fc54e43
Add error handling or CSVs missing a header
seadowg Dec 18, 2024
d750000
Start using Commons CSV parsing
seadowg Dec 18, 2024
1fc8db9
Flatten list used to find matches
seadowg Dec 19, 2024
98e47dd
Calculate matches with a parallel fold
seadowg Dec 19, 2024
52f308c
Add test to ensure parallelism
seadowg Dec 19, 2024
c064a4e
Add ability to tweak thread count
seadowg Dec 19, 2024
a9ad6e7
Add ability to tweak threshold
seadowg Dec 19, 2024
09cfbd5
Output all scores between templates for a match
seadowg Dec 19, 2024
40be892
Add pmatch to help test
seadowg Dec 20, 2024
0e2d158
Add help details for pmatch
seadowg Dec 20, 2024
4241021
Move parallelFold to Sequence
seadowg Sep 1, 2025
1726f84
Upgrade SourceAFIS
seadowg Oct 13, 2025
706c0fe
Allow templates to be converted just once
seadowg Oct 13, 2025
d2eea59
Rename interface
seadowg Oct 13, 2025
7e3b516
Use class instead of Pair
seadowg Oct 13, 2025
954960d
Correct operation name
seadowg Oct 13, 2025
fa7ae7e
Remove use of Set
seadowg Oct 13, 2025
ae3f413
Update test name
seadowg Oct 13, 2025
7effbdb
Switch parallel flat map output to Iterable
seadowg Dec 11, 2025
b2b20fa
Switch to use Cursor pattern
seadowg Dec 11, 2025
9afaae7
Make all items are included in parallelFlatMap
seadowg Dec 11, 2025
5e20c4d
Use coroutines instead of Futures
seadowg Dec 12, 2025
50cc53f
Add window size arg
seadowg Dec 12, 2025
7b3d6bc
Add benchmark vs sequential processing
seadowg Dec 12, 2025
2afc1c5
Rename test
seadowg Dec 12, 2025
1802982
Make sure window size is passed down
seadowg Dec 12, 2025
6066583
add dockerfile for keppel-cli image
HusnaHariz Jul 21, 2025
29f538c
add docker image github actions workflow
HusnaHariz Jul 21, 2025
3a361b9
update docker-image workflow
HusnaHariz Jul 21, 2025
e007d3c
update gradle version
HusnaHariz Sep 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions .github/workflows/docker-image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
name: Create and publish a Docker image

# Configures this workflow to run every time a change is pushed to the `master` or `publish-deduplication-docker-image` branch.
on:
push:
branches: ['master', 'publish-deduplication-docker-image']

# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}

# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs:
build-and-push-image:
runs-on: ubuntu-latest
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
permissions:
contents: read
packages: write
attestations: write
id-token: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
# Uses the `docker/login-action` action to log in to the Container registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
# This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
flavor: |
latest=true
tags: |
type=ref,event=branch
type=sha
# This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
# It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository.
# It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.
- name: Build and push Docker image
id: push
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
with:
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
# This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see "[AUTOTITLE](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds)."
- name: Generate artifact attestation
uses: actions/attest-build-provenance@v1
with:
subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}}
subject-digest: ${{ steps.push.outputs.digest }}
push-to-registry: true
7 changes: 5 additions & 2 deletions CLI/build.gradle
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
plugins {
id 'org.jetbrains.kotlin.jvm' version '1.3.61'
id 'org.jetbrains.kotlin.jvm' version '2.1.0'
id 'application'
}

Expand All @@ -11,15 +11,18 @@ repositories {
}

dependencies {
implementation 'com.machinezoo.sourceafis:sourceafis:3.8.1'
implementation 'com.machinezoo.sourceafis:sourceafis:3.18.1'
implementation 'commons-codec:commons-codec:1.14'
implementation platform('org.jetbrains.kotlin:kotlin-bom')
implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8'
implementation group: 'org.slf4j', name: 'slf4j-nop', version: '1.7.30'
implementation("com.github.ajalt:clikt:2.5.0")
implementation 'org.apache.commons:commons-csv:1.12.0'
implementation "org.jetbrains.kotlinx:kotlinx-coroutines-core:1.10.2"
testImplementation 'org.jetbrains.kotlin:kotlin-test'
testImplementation 'org.jetbrains.kotlin:kotlin-test-junit'
testImplementation "com.nhaarman.mockitokotlin2:mockito-kotlin:2.2.0"
testImplementation 'org.hamcrest:hamcrest:3.0'
}

application {
Expand Down
9 changes: 6 additions & 3 deletions CLI/src/main/kotlin/uk/ac/lshtm/keppel/cli/App.kt
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,21 @@ import com.github.ajalt.clikt.core.PrintHelpMessage
import com.github.ajalt.clikt.core.subcommands

fun main(args: Array<String>) {
App(SourceAFISMatcher(), 40.0).execute(args.toList(), StdoutLogger())
App(SourceAFISTemplateFactory(), 40.0).execute(args.toList(), StdoutLogger())
}

class App(private val matcher: Matcher,
class App(private val templateFactory: TemplateFactory,
private val defaultThreshold: Double) {
fun execute(args: List<String>, logger: Logger) {
class Root : CliktCommand(name = "keppel") {
override fun run() = Unit
}

try {
Root().subcommands(MatchCommand(matcher, defaultThreshold, logger)).parse(args)
Root().subcommands(
MatchCommand(templateFactory, defaultThreshold, logger),
PMatchCommand(templateFactory, defaultThreshold)
).parse(args)
} catch (e: PrintHelpMessage) {
logger.log(e.command.getFormattedHelp())
} catch (e: Exception) {
Expand Down
18 changes: 10 additions & 8 deletions CLI/src/main/kotlin/uk/ac/lshtm/keppel/cli/MatchCommand.kt
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ import com.github.ajalt.clikt.parameters.types.double
import java.io.File

class MatchCommand(
private val matcher: Matcher,
private val defaultThreshold: Double,
private val logger: Logger) : CliktCommand(
name = "match",
help = "Match two hex encoded ISO fingerprint templates. Threshold used for matching is $defaultThreshold.") {
private val templateFactory: TemplateFactory,
private val defaultThreshold: Double,
private val logger: Logger
) : CliktCommand(
name = "match",
help = "Match two hex encoded ISO fingerprint templates. Threshold used for matching is $defaultThreshold."
) {

private val plainText by option("-p", help = Strings.PLAIN_TEXT_HELP).flag(default = false)
private val matchWithScore by option("-ms", help = Strings.MATCH_WITH_SCORE_HELP).flag(default = false)
Expand All @@ -24,12 +26,12 @@ class MatchCommand(

override fun run() {
val (templateOne, templateTwo) = if (plainText) {
Pair(templateOne.toByteArray(), templateTwo.toByteArray())
Pair(templateFactory.getTemplate(templateOne.toByteArray()), templateFactory.getTemplate(templateTwo.toByteArray()))
} else {
Pair(readAndTrim(File(templateOne)), readAndTrim(File(templateTwo)))
Pair(templateFactory.getTemplate(readAndTrim(File(templateOne))), templateFactory.getTemplate(readAndTrim(File(templateTwo))))
}

val score = matcher.match(templateOne, templateTwo)
val score = templateOne.match(templateTwo)
if (matchWithScore) {
if (isMatch(score)) {
logger.log("match_$score")
Expand Down
6 changes: 0 additions & 6 deletions CLI/src/main/kotlin/uk/ac/lshtm/keppel/cli/Matcher.kt

This file was deleted.

49 changes: 49 additions & 0 deletions CLI/src/main/kotlin/uk/ac/lshtm/keppel/cli/PMatchCommand.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package uk.ac.lshtm.keppel.cli

import com.github.ajalt.clikt.core.CliktCommand
import com.github.ajalt.clikt.parameters.options.option
import com.github.ajalt.clikt.parameters.options.required
import com.github.ajalt.clikt.parameters.types.double
import com.github.ajalt.clikt.parameters.types.int
import uk.ac.lshtm.keppel.cli.subject.SubjectParser
import uk.ac.lshtm.keppel.cli.subject.SubjectUseCases
import java.io.File

/**
 * The `pmatch` subcommand: reads subjects from an input CSV, searches for matches between them
 * and writes every match, together with its scores, to an output CSV.
 *
 * @param templateFactory used to turn the template strings read from the CSV into [Template]s
 * @param defaultThreshold threshold used when the `-t` option is not supplied
 */
class PMatchCommand(
    private val templateFactory: TemplateFactory,
    private val defaultThreshold: Double
) :
    CliktCommand(
        name = "pmatch",
        help = "Find matches between subjects from an input CSV. Threshold used for matching is $defaultThreshold."
    ) {

    private val inputCsvPath by option("-i", help = Strings.INPUT_CSV_HELP).required()
    private val outputCsvPath by option("-o", help = Strings.OUTPUT_CSV_HELP).required()
    private val parallelism by option("-p", help = Strings.PARALLELISM_HELP).int()
    private val threshold by option("-t", help = Strings.THRESHOLD_HELP).double()
    private val windowSize by option("-w").int()

    /**
     * Parses the input CSV, finds the matches and writes them to the output CSV.
     *
     * @throws IllegalArgumentException if the input CSV is rejected by the parser for having a bad header
     */
    override fun run() {
        val subjects = try {
            SubjectParser.parseCsv(inputCsvPath)
        } catch (e: SubjectParser.BadHeaderException) {
            // Keep the parser failure as the cause so it is not lost when the message is rewritten.
            throw IllegalArgumentException(Strings.ERROR_PMATCH_NO_HEADER_ROW, e)
        }

        val matches = SubjectUseCases.findMatches(
            subjects,
            templateFactory,
            threshold ?: defaultThreshold,
            parallelism,
            windowSize
        )

        File(outputCsvPath).printWriter().use { writer ->
            matches.forEachIndexed { index, match ->
                // The number of score columns is taken from the first match, so the header is
                // only written once a match exists; with no matches nothing is printed.
                if (index == 0) {
                    val scoreColumns = (1..match.scores.size).joinToString(", ") { "score_$it" }
                    writer.println("id_1, id_2, $scoreColumns")
                }

                writer.println("${match.id1}, ${match.id2}, ${match.scores.joinToString(", ")}")
            }
        }
    }
}
17 changes: 0 additions & 17 deletions CLI/src/main/kotlin/uk/ac/lshtm/keppel/cli/SourceAFISMatcher.kt

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package uk.ac.lshtm.keppel.cli

import com.machinezoo.sourceafis.FingerprintCompatibility.importTemplate
import com.machinezoo.sourceafis.FingerprintMatcher
import com.machinezoo.sourceafis.FingerprintTemplate
import org.apache.commons.codec.binary.Hex

/**
 * [TemplateFactory] whose templates are backed by SourceAFIS.
 */
class SourceAFISTemplateFactory : TemplateFactory {
    /** Wraps [bytes] in a SourceAFIS-backed [Template]. */
    override fun getTemplate(bytes: ByteArray): Template = SourceAFISTemplate(bytes)
}

/**
 * [Template] that interprets [bytes] as a hex string, decodes it and imports the result
 * as a SourceAFIS [FingerprintTemplate].
 *
 * Both the imported template and the matcher built from it are created lazily on first use
 * and then reused.
 */
private class SourceAFISTemplate(private val bytes: ByteArray) : Template {

    val template: FingerprintTemplate by lazy {
        val decoded = Hex.decodeHex(String(bytes))
        importTemplate(decoded)
    }

    private val matcher by lazy { FingerprintMatcher(template) }

    /**
     * Scores [other] against this template using this template's matcher.
     *
     * [other] is cast to [SourceAFISTemplate], so any other [Template] implementation fails the cast.
     */
    override fun match(other: Template): Double {
        // Resolve this side's matcher before touching the other template (same order as a direct call).
        val probeMatcher = matcher
        val candidate = (other as SourceAFISTemplate).template
        return probeMatcher.match(candidate)
    }
}
8 changes: 7 additions & 1 deletion CLI/src/main/kotlin/uk/ac/lshtm/keppel/cli/Strings.kt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,10 @@ object Strings {
const val MATCH_WITH_SCORE_HELP = "Return whether templates match along with score like \"match_210.124\""
const val MATCH_WITHOUT_SCORE_HELP = "Return whether templates match (either \"match\" or \"mismatch\")"
const val THRESHOLD_HELP = "Threshold (score) to be used to determine whether templates are a match or mismatch"
}

const val INPUT_CSV_HELP = "Path to input CSV"
const val OUTPUT_CSV_HELP = "Path to output CSV"
const val PARALLELISM_HELP = "How many threads should be used"

const val ERROR_PMATCH_NO_HEADER_ROW = "Input CSV does not have header row!"
}
10 changes: 10 additions & 0 deletions CLI/src/main/kotlin/uk/ac/lshtm/keppel/cli/TemplateFactory.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package uk.ac.lshtm.keppel.cli

/**
 * Creates [Template]s from raw bytes.
 */
interface TemplateFactory {

/**
 * Returns a [Template] for [bytes]. How the bytes are interpreted is up to the implementation.
 */
fun getTemplate(bytes: ByteArray): Template
}

/**
 * A fingerprint template that can be scored against another template.
 */
interface Template {
/**
 * Returns the score of comparing this template with [other]. Callers treat a score at or
 * above their threshold as a match.
 */
fun match(other: Template): Double
}
3 changes: 3 additions & 0 deletions CLI/src/main/kotlin/uk/ac/lshtm/keppel/cli/subject/Subject.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package uk.ac.lshtm.keppel.cli.subject

/**
 * A subject read from the input data: an [id] plus its [templates] in their string form.
 */
data class Subject(val id: String, val templates: List<String>)
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package uk.ac.lshtm.keppel.cli.subject

import org.apache.commons.csv.CSVFormat
import org.apache.commons.csv.CSVParser
import java.io.File

/**
 * Reads [Subject]s from CSV files.
 */
object SubjectParser {
    /**
     * Parses the CSV at [file] into one [Subject] per record.
     *
     * The first record is used as the header and its first column must be named `id`. For each
     * record the templates are read from the columns named `template_1`, `template_2`, ... up to
     * one fewer than the number of values in that record. Values are trimmed.
     *
     * @param file path of the CSV file to read
     * @return the parsed subjects, in the order they are read from the file
     * @throws BadHeaderException if there is no header or its first column is not `id`
     */
    @Throws(BadHeaderException::class)
    fun parseCsv(file: String): List<Subject> {
        val format = CSVFormat.Builder.create()
            .setDelimiter(',')
            .setTrim(true)
            .setHeader()
            .build()

        // `use` closes the reader and the parser on every path (including the bad-header throw
        // and a failure while constructing the parser) so the file handle is never leaked.
        return File(file).reader().use { reader ->
            CSVParser(reader, format).use { csvParser ->
                if (csvParser.headerNames.firstOrNull() != "id") {
                    throw BadHeaderException()
                }

                // Materialise with toList() before the parser is closed.
                csvParser.asSequence().map { row ->
                    val templates = 1.until(row.size()).map {
                        row.get("template_$it")
                    }

                    Subject(row.get("id"), templates)
                }.toList()
            }
        }
    }

    /** Thrown when the CSV has no header or the header does not start with an `id` column. */
    class BadHeaderException : Exception()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package uk.ac.lshtm.keppel.cli.subject

import uk.ac.lshtm.keppel.cli.Template
import uk.ac.lshtm.keppel.cli.TemplateFactory
import uk.ac.lshtm.keppel.cli.util.parallelFlatMap
import uk.ac.lshtm.keppel.cli.util.uniquePairs

/**
 * Operations over collections of [Subject]s.
 */
object SubjectUseCases {

    /**
     * Scores the pairs of subjects produced by `uniquePairs()` and returns those that match.
     *
     * Each subject's template strings are first converted with [templateFactory]. For a pair,
     * templates at the same position are scored against each other (extra templates on the
     * longer side are not compared), and the pair is reported when at least one score is at or
     * above [threshold]. [parallelism] and [windowSize] are forwarded unchanged to
     * `parallelFlatMap`.
     *
     * @return a [Match] carrying both ids and all of the pair's scores for every matching pair
     */
    fun findMatches(
        subjects: List<Subject>,
        templateFactory: TemplateFactory,
        threshold: Double,
        parallelism: Int? = null,
        windowSize: Int? = null
    ): Iterable<Match> {
        val candidates = subjects.map { subject ->
            val converted = subject.templates.map { raw -> templateFactory.getTemplate(raw.toByteArray()) }
            SubjectWithTemplates(subject.id, converted)
        }

        return candidates
            .uniquePairs()
            .parallelFlatMap(parallelism, windowSize) { candidatePair ->
                val left = candidatePair.first
                val right = candidatePair.second
                val scores = left.templates.zip(right.templates) { one, two -> one.match(two) }

                if (scores.none { it >= threshold }) {
                    emptyList()
                } else {
                    listOf(Match(left.id, right.id, scores))
                }
            }
    }

    /** A matching pair of subject ids together with the scores computed between their templates. */
    data class Match(val id1: String, val id2: String, val scores: List<Double>)
}

/** A subject's [id] paired with its templates after conversion to [Template] objects. */
private class SubjectWithTemplates(val id: String, val templates: List<Template>)
Loading