Skip to content

Commit b22f043

Browse files
committed
feat(ingest): improve book import validation and text normalization
1 parent 1dced6b commit b22f043

File tree

13 files changed

+190
-9
lines changed

13 files changed

+190
-9
lines changed

apps/backend/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,4 @@ mvn spring-boot:run
4545
- El pipeline narrativo incluye normalizacion de texto, memoria de entidades, grafo de relaciones y nivel cognitivo por escena.
4646
- Persistencia docente sobre JDBC + Flyway (`classrooms`, `students`, `assignments`, `attempts`).
4747
- Default local con H2 file DB; PostgreSQL habilitado por variables de entorno Spring datasource.
48+
- Importacion de libros restringida a `.txt` y `.pdf` con limite configurable (`app.import.max-bytes`, default 25MB).

apps/backend/src/main/java/com/juegodefinitivo/autobook/config/AppBeansConfig.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,6 @@ public BookCatalogService bookCatalogService(AppConfig config) {
6868

6969
@Bean
7070
public BookImportService bookImportService(AppConfig config) {
71-
return new BookImportService(config.booksDir());
71+
return new BookImportService(config.booksDir(), config.maxImportBytes());
7272
}
7373
}

apps/backend/src/main/java/com/juegodefinitivo/autobook/config/AppConfig.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ public record AppConfig(
77
Path dataDir,
88
Path saveFile,
99
Path booksDir,
10+
long maxImportBytes,
1011
int sceneMaxChars,
1112
int sceneLinesPerChunk
1213
) {

apps/backend/src/main/java/com/juegodefinitivo/autobook/config/ConfigLoader.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ public AppConfig load() {
2121
Path dataDir = Path.of(properties.getProperty("app.data.dir", ".autobook-data")).toAbsolutePath().normalize();
2222
Path saveFile = dataDir.resolve(properties.getProperty("app.save.file", "savegame.properties"));
2323
Path booksDir = Path.of(properties.getProperty("app.books.dir", "books")).toAbsolutePath().normalize();
24+
long maxImportBytes = Long.parseLong(properties.getProperty("app.import.max-bytes", "26214400"));
2425

2526
int maxChars = Integer.parseInt(properties.getProperty("app.scene.max.chars", "420"));
2627
int linesPerChunk = Integer.parseInt(properties.getProperty("app.scene.lines-per-chunk", "4"));
@@ -30,6 +31,7 @@ public AppConfig load() {
3031
dataDir,
3132
saveFile,
3233
booksDir,
34+
maxImportBytes,
3335
maxChars,
3436
linesPerChunk
3537
);

apps/backend/src/main/java/com/juegodefinitivo/autobook/ingest/BookImportService.java

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,33 @@
99
public class BookImportService {
1010

1111
private final Path booksDir;
12+
private final long maxImportBytes;
1213

1314
/**
 * Creates an import service that uses the built-in default size limit
 * of 25 * 1024 * 1024 bytes.
 *
 * @param booksDir directory into which imported books are copied
 */
public BookImportService(Path booksDir) {
    this(booksDir, 25L * 1024L * 1024L);
}

/**
 * Creates an import service with an explicit size limit.
 *
 * @param booksDir directory into which imported books are copied
 * @param maxImportBytes largest accepted source file size in bytes;
 *        values below 1024 are raised to 1024
 */
public BookImportService(Path booksDir, long maxImportBytes) {
    // Never allow a limit smaller than 1 KiB, whatever the configuration says.
    this.maxImportBytes = maxImportBytes < 1024 ? 1024 : maxImportBytes;
    this.booksDir = booksDir;
}
1622

1723
public BookAsset importFromInput(String rawPath) {
1824
Path sourcePath = parsePath(rawPath);
1925
if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {
2026
throw new IllegalArgumentException("No existe el archivo indicado: " + sourcePath);
2127
}
28+
if (!Files.isReadable(sourcePath)) {
29+
throw new IllegalArgumentException("No se puede leer el archivo indicado.");
30+
}
31+
String format = detectFormat(sourcePath);
32+
validateFileSize(sourcePath);
2233

2334
try {
2435
Files.createDirectories(booksDir);
25-
String cleanName = sourcePath.getFileName().toString().replaceAll("\\s+", "-").toLowerCase();
36+
String cleanName = sanitizeName(sourcePath.getFileName().toString(), format);
2637
Path destination = booksDir.resolve(cleanName);
2738
Files.copy(sourcePath, destination, StandardCopyOption.REPLACE_EXISTING);
28-
String format = destination.getFileName().toString().toLowerCase().endsWith(".pdf") ? "pdf" : "txt";
2939
return new BookAsset(destination.getFileName().toString(), destination.toAbsolutePath().normalize(), format);
3040
} catch (IOException e) {
3141
throw new IllegalStateException("No se pudo importar el libro", e);
@@ -41,6 +51,47 @@ Path parsePath(String rawPath) {
4151
if (input.startsWith("file:///")) {
4252
return Path.of(URI.create(input)).toAbsolutePath().normalize();
4353
}
54+
if (input.startsWith("file://")) {
55+
return Path.of(URI.create(input)).toAbsolutePath().normalize();
56+
}
4457
return Path.of(input).toAbsolutePath().normalize();
4558
}
59+
60+
private String detectFormat(Path sourcePath) {
61+
String lower = sourcePath.getFileName().toString().toLowerCase();
62+
if (lower.endsWith(".txt")) {
63+
return "txt";
64+
}
65+
if (lower.endsWith(".pdf")) {
66+
return "pdf";
67+
}
68+
throw new IllegalArgumentException("Formato no soportado. Usa .txt o .pdf");
69+
}
70+
71+
private void validateFileSize(Path sourcePath) {
72+
try {
73+
long size = Files.size(sourcePath);
74+
if (size <= 0) {
75+
throw new IllegalArgumentException("El archivo esta vacio.");
76+
}
77+
if (size > maxImportBytes) {
78+
throw new IllegalArgumentException("El archivo excede el limite permitido (" + maxImportBytes + " bytes).");
79+
}
80+
} catch (IOException e) {
81+
throw new IllegalArgumentException("No se pudo validar el tamano del archivo.", e);
82+
}
83+
}
84+
85+
private String sanitizeName(String originalName, String format) {
86+
String baseName = originalName.replaceAll("\\.[^.]+$", "");
87+
String safeBase = baseName
88+
.toLowerCase()
89+
.replaceAll("[^a-z0-9-_]+", "-")
90+
.replaceAll("-{2,}", "-")
91+
.replaceAll("(^-+|-+$)", "");
92+
if (safeBase.isBlank()) {
93+
safeBase = "book-import";
94+
}
95+
return safeBase + "." + format;
96+
}
4697
}

apps/backend/src/main/java/com/juegodefinitivo/autobook/ingest/BookTextNormalizer.java

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
public class BookTextNormalizer {
1414

1515
private static final Pattern PAGE_NUMBER_LINE = Pattern.compile("^(?:p(?:ag(?:ina)?)?\\.?\\s*)?[0-9ivxlcdm]{1,8}$", Pattern.CASE_INSENSITIVE);
16+
private static final Pattern TOC_DOTTED_LINE = Pattern.compile("^[\\p{L}\\p{N} ,;:()'\"-]{4,}\\.{2,}\\s*[0-9ivxlcdm]{1,8}$", Pattern.CASE_INSENSITIVE);
1617
private static final Pattern MULTI_SPACE = Pattern.compile("\\s+");
1718

1819
public String normalize(String rawText) {
@@ -22,6 +23,7 @@ public String normalize(String rawText) {
2223

2324
String normalizedLineBreaks = rawText
2425
.replace("\uFEFF", "")
26+
.replace("\u00AD", "")
2527
.replace("\r\n", "\n")
2628
.replace('\r', '\n');
2729

@@ -31,7 +33,7 @@ public String normalize(String rawText) {
3133
List<String> resultLines = new ArrayList<>();
3234
for (int p = 0; p < pages.size(); p++) {
3335
List<String> filtered = filterNoise(pages.get(p), profile);
34-
List<String> merged = mergeHyphenatedBreaks(filtered);
36+
List<String> merged = mergeWrappedLines(mergeHyphenatedBreaks(filtered));
3537
if (!resultLines.isEmpty()) {
3638
resultLines.add("");
3739
}
@@ -93,6 +95,12 @@ private List<String> filterNoise(List<String> page, HeaderFooterProfile profile)
9395
if (PAGE_NUMBER_LINE.matcher(line).matches()) {
9496
continue;
9597
}
98+
if (TOC_DOTTED_LINE.matcher(line).matches()) {
99+
continue;
100+
}
101+
if (looksLikeTableOfContents(line)) {
102+
continue;
103+
}
96104
if (i == indexOfFirstNonBlank(page)
97105
&& profile.headerCount().getOrDefault(headerSig, 0) >= profile.minRepetitions()) {
98106
continue;
@@ -130,6 +138,35 @@ private List<String> mergeHyphenatedBreaks(List<String> lines) {
130138
return merged;
131139
}
132140

141+
private List<String> mergeWrappedLines(List<String> lines) {
142+
List<String> merged = new ArrayList<>();
143+
int i = 0;
144+
while (i < lines.size()) {
145+
String current = lines.get(i);
146+
if (current.isBlank()) {
147+
merged.add("");
148+
i++;
149+
continue;
150+
}
151+
StringBuilder paragraph = new StringBuilder(current);
152+
int cursor = i + 1;
153+
while (cursor < lines.size()) {
154+
String next = lines.get(cursor);
155+
if (next.isBlank()) {
156+
break;
157+
}
158+
if (!shouldJoin(paragraph.toString(), next)) {
159+
break;
160+
}
161+
paragraph.append(' ').append(next);
162+
cursor++;
163+
}
164+
merged.add(paragraph.toString());
165+
i = cursor;
166+
}
167+
return merged;
168+
}
169+
133170
private String collapseBlankRuns(List<String> lines) {
134171
StringBuilder out = new StringBuilder();
135172
boolean lastBlank = false;
@@ -162,6 +199,31 @@ private boolean startsWithLowercase(String line) {
162199
return Character.isLetter(codePoint) && Character.isLowerCase(codePoint);
163200
}
164201

202+
private boolean shouldJoin(String previous, String next) {
203+
if (previous.endsWith(".")
204+
|| previous.endsWith("!")
205+
|| previous.endsWith("?")
206+
|| previous.endsWith(":")
207+
|| previous.endsWith(";")) {
208+
return false;
209+
}
210+
if (next.startsWith("- ")
211+
|| next.matches("^[0-9]+[\\).].*")
212+
|| next.matches("^[ivxlcdm]+[\\).].*")) {
213+
return false;
214+
}
215+
return startsWithLowercase(next) || previous.length() >= 45;
216+
}
217+
218+
private boolean looksLikeTableOfContents(String line) {
219+
String normalized = line.toLowerCase(Locale.ROOT);
220+
return (normalized.startsWith("capitulo ")
221+
|| normalized.startsWith("chapter ")
222+
|| normalized.startsWith("seccion ")
223+
|| normalized.startsWith("parte "))
224+
&& normalized.matches(".*\\b[0-9ivxlcdm]{1,8}$");
225+
}
226+
165227
private boolean isHeaderFooterCandidate(String line) {
166228
if (line == null || line.isBlank()) {
167229
return false;

apps/backend/src/main/java/com/juegodefinitivo/autobook/ingest/PdfTextExtractor.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,18 @@ public class PdfTextExtractor implements DocumentTextExtractor {
1212
public String extract(Path path) {
1313
try (PDDocument document = Loader.loadPDF(path.toFile())) {
1414
PDFTextStripper stripper = new PDFTextStripper();
15-
return stripper.getText(document);
15+
stripper.setSortByPosition(true);
16+
StringBuilder allText = new StringBuilder();
17+
int totalPages = Math.max(1, document.getNumberOfPages());
18+
for (int page = 1; page <= totalPages; page++) {
19+
stripper.setStartPage(page);
20+
stripper.setEndPage(page);
21+
allText.append(stripper.getText(document));
22+
if (page < totalPages) {
23+
allText.append('\f');
24+
}
25+
}
26+
return allText.toString();
1627
} catch (IOException e) {
1728
throw new IllegalArgumentException("No se pudo leer PDF: " + path, e);
1829
}

apps/backend/src/main/resources/application.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ app.name=AutoBook Adventure
22
app.data.dir=.autobook-data
33
app.save.file=savegame.properties
44
app.books.dir=books
5+
app.import.max-bytes=26214400
56
app.scene.max.chars=420
67
app.scene.lines-per-chunk=4
78

apps/backend/src/test/java/com/juegodefinitivo/autobook/BookImportServiceTest.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66

77
import java.nio.file.Files;
88
import java.nio.file.Path;
9+
import java.util.Arrays;
910

11+
import static org.junit.jupiter.api.Assertions.assertThrows;
1012
import static org.junit.jupiter.api.Assertions.assertTrue;
1113

1214
class BookImportServiceTest {
@@ -24,4 +26,24 @@ void shouldImportFromFileUri(@TempDir Path tempDir) throws Exception {
2426
assertTrue(Files.exists(asset.path()));
2527
assertTrue(asset.title().endsWith(".txt"));
2628
}
29+
30+
@Test
31+
void shouldRejectUnsupportedExtension(@TempDir Path tempDir) throws Exception {
32+
Path source = tempDir.resolve("sample.docx");
33+
Files.writeString(source, "contenido");
34+
BookImportService service = new BookImportService(tempDir.resolve("catalog"), 1024 * 1024);
35+
36+
assertThrows(IllegalArgumentException.class, () -> service.importFromInput(source.toString()));
37+
}
38+
39+
@Test
40+
void shouldRejectFilesAboveMaxImportSize(@TempDir Path tempDir) throws Exception {
41+
Path source = tempDir.resolve("big-book.txt");
42+
byte[] payload = new byte[3000];
43+
Arrays.fill(payload, (byte) 'a');
44+
Files.write(source, payload);
45+
BookImportService service = new BookImportService(tempDir.resolve("catalog"), 1024);
46+
47+
assertThrows(IllegalArgumentException.class, () -> service.importFromInput(source.toString()));
48+
}
2749
}

apps/backend/src/test/java/com/juegodefinitivo/autobook/BookTextNormalizerTest.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,5 +47,22 @@ void shouldMergeHyphenatedLineBreaks() {
4747

4848
assertTrue(clean.contains("historia para ninos."));
4949
}
50+
51+
@Test
52+
void shouldRemoveIndexLinesAndJoinWrappedParagraphs() {
53+
String raw = """
54+
Capitulo 1 ............. 9
55+
Capitulo 2 ............. 15
56+
El caballero miro el castillo
57+
y decidio continuar con cautela
58+
porque la noche era fria.
59+
""";
60+
61+
String clean = normalizer.normalize(raw);
62+
63+
assertFalse(clean.contains("Capitulo 1"));
64+
assertFalse(clean.contains("Capitulo 2"));
65+
assertTrue(clean.contains("El caballero miro el castillo y decidio continuar con cautela porque la noche era fria."));
66+
}
5067
}
5168

0 commit comments

Comments
 (0)