Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ mvn spring-boot:run
- El pipeline narrativo incluye normalizacion de texto, memoria de entidades, grafo de relaciones y nivel cognitivo por escena.
- Persistencia docente sobre JDBC + Flyway (`classrooms`, `students`, `assignments`, `attempts`).
- Default local con H2 file DB; PostgreSQL habilitado por variables de entorno Spring datasource.
- Importacion de libros restringida a `.txt` y `.pdf` con limite configurable (`app.import.max-bytes`, default 26214400 bytes = 25 MiB).
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,6 @@ public BookCatalogService bookCatalogService(AppConfig config) {

/**
 * Exposes the {@link BookImportService} bean.
 *
 * <p>The service is built from the books directory and the maximum import size carried by
 * the application configuration, so the size limit is actually honoured at import time.
 *
 * @param config application configuration supplying {@code booksDir()} and {@code maxImportBytes()}
 * @return the import service wired with the configured directory and size limit
 */
@Bean
public BookImportService bookImportService(AppConfig config) {
    // Single return: a second return statement after this one would be unreachable code.
    return new BookImportService(config.booksDir(), config.maxImportBytes());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ public record AppConfig(
Path dataDir,
Path saveFile,
Path booksDir,
long maxImportBytes,
int sceneMaxChars,
int sceneLinesPerChunk
) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public AppConfig load() {
Path dataDir = Path.of(properties.getProperty("app.data.dir", ".autobook-data")).toAbsolutePath().normalize();
Path saveFile = dataDir.resolve(properties.getProperty("app.save.file", "savegame.properties"));
Path booksDir = Path.of(properties.getProperty("app.books.dir", "books")).toAbsolutePath().normalize();
long maxImportBytes = Long.parseLong(properties.getProperty("app.import.max-bytes", "26214400"));

int maxChars = Integer.parseInt(properties.getProperty("app.scene.max.chars", "420"));
int linesPerChunk = Integer.parseInt(properties.getProperty("app.scene.lines-per-chunk", "4"));
Expand All @@ -30,6 +31,7 @@ public AppConfig load() {
dataDir,
saveFile,
booksDir,
maxImportBytes,
maxChars,
linesPerChunk
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,33 @@
public class BookImportService {

private final Path booksDir;
private final long maxImportBytes;

/**
 * Creates an import service that uses the built-in size limit of 25 MiB (26214400 bytes).
 *
 * @param booksDir directory that imported books are stored in
 */
public BookImportService(Path booksDir) {
    // 25L << 20 == 25 * 1024 * 1024
    this(booksDir, 25L << 20);
}

/**
 * Creates an import service with an explicit size limit.
 *
 * @param booksDir directory that imported books are stored in
 * @param maxImportBytes largest accepted source file size in bytes; values below 1024 are
 *     raised to 1024
 */
public BookImportService(Path booksDir, long maxImportBytes) {
    // Never allow a limit smaller than 1 KiB.
    this.maxImportBytes = maxImportBytes < 1024 ? 1024 : maxImportBytes;
    this.booksDir = booksDir;
}

public BookAsset importFromInput(String rawPath) {
Path sourcePath = parsePath(rawPath);
if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {
throw new IllegalArgumentException("No existe el archivo indicado: " + sourcePath);
}
if (!Files.isReadable(sourcePath)) {
throw new IllegalArgumentException("No se puede leer el archivo indicado.");
}
String format = detectFormat(sourcePath);
validateFileSize(sourcePath);

try {
Files.createDirectories(booksDir);
String cleanName = sourcePath.getFileName().toString().replaceAll("\\s+", "-").toLowerCase();
String cleanName = sanitizeName(sourcePath.getFileName().toString(), format);
Path destination = booksDir.resolve(cleanName);
Files.copy(sourcePath, destination, StandardCopyOption.REPLACE_EXISTING);
String format = destination.getFileName().toString().toLowerCase().endsWith(".pdf") ? "pdf" : "txt";
return new BookAsset(destination.getFileName().toString(), destination.toAbsolutePath().normalize(), format);
} catch (IOException e) {
throw new IllegalStateException("No se pudo importar el libro", e);
Expand All @@ -41,6 +51,47 @@ Path parsePath(String rawPath) {
if (input.startsWith("file:///")) {
return Path.of(URI.create(input)).toAbsolutePath().normalize();
}
if (input.startsWith("file://")) {
return Path.of(URI.create(input)).toAbsolutePath().normalize();
}
return Path.of(input).toAbsolutePath().normalize();
}

/**
 * Determines the book format from the file name extension (case-insensitive).
 *
 * @param sourcePath path whose last name element is inspected
 * @return {@code "txt"} or {@code "pdf"}
 * @throws IllegalArgumentException if the file name ends with neither {@code .txt} nor {@code .pdf}
 */
private String detectFormat(Path sourcePath) {
    String fileName = sourcePath.getFileName().toString().toLowerCase();
    // Checked in this order: txt first, then pdf.
    for (String extension : new String[] {"txt", "pdf"}) {
        if (fileName.endsWith("." + extension)) {
            return extension;
        }
    }
    throw new IllegalArgumentException("Formato no soportado. Usa .txt o .pdf");
}

/**
 * Checks that the source file is neither empty nor larger than {@code maxImportBytes}.
 *
 * @param sourcePath file whose size is checked
 * @throws IllegalArgumentException if the size cannot be read, the file is empty, or the
 *     file exceeds the configured limit
 */
private void validateFileSize(Path sourcePath) {
    final long byteCount;
    try {
        byteCount = Files.size(sourcePath);
    } catch (IOException ioFailure) {
        // Surface I/O problems as an invalid-input error, keeping the cause.
        throw new IllegalArgumentException("No se pudo validar el tamano del archivo.", ioFailure);
    }

    if (byteCount <= 0) {
        throw new IllegalArgumentException("El archivo esta vacio.");
    }
    if (byteCount > maxImportBytes) {
        throw new IllegalArgumentException("El archivo excede el limite permitido (" + maxImportBytes + " bytes).");
    }
}

/**
 * Builds a safe destination file name from the original file name.
 *
 * <p>The last extension is dropped, the remainder is lower-cased, every run of characters
 * outside {@code a-z}, {@code 0-9}, {@code -} and {@code _} becomes a single hyphen, and
 * leading/trailing hyphens are removed. If nothing is left, {@code "book-import"} is used.
 * The given format is appended as the extension.
 *
 * @param originalName file name as provided by the source path
 * @param format extension to append, without the dot (for example {@code "txt"})
 * @return sanitized name of the form {@code <base>.<format>}
 */
private String sanitizeName(String originalName, String format) {
    // Strip only the final ".ext" segment.
    String baseName = originalName.replaceAll("\\.[^.]+$", "");
    String safeBase = baseName
        // Locale.ROOT keeps lower-casing stable on any JVM default locale; under a Turkish
        // locale "I" would otherwise become dotless "ı" and be replaced by a hyphen.
        // Fully qualified because this file's import block is not touched by this change.
        .toLowerCase(java.util.Locale.ROOT)
        .replaceAll("[^a-z0-9-_]+", "-")
        .replaceAll("-{2,}", "-")
        .replaceAll("(^-+|-+$)", "");
    if (safeBase.isBlank()) {
        safeBase = "book-import";
    }
    return safeBase + "." + format;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
public class BookTextNormalizer {

private static final Pattern PAGE_NUMBER_LINE = Pattern.compile("^(?:p(?:ag(?:ina)?)?\\.?\\s*)?[0-9ivxlcdm]{1,8}$", Pattern.CASE_INSENSITIVE);
private static final Pattern TOC_DOTTED_LINE = Pattern.compile("^[\\p{L}\\p{N} ,;:()'\"-]{4,}\\.{2,}\\s*[0-9ivxlcdm]{1,8}$", Pattern.CASE_INSENSITIVE);
private static final Pattern MULTI_SPACE = Pattern.compile("\\s+");

public String normalize(String rawText) {
Expand All @@ -22,6 +23,7 @@ public String normalize(String rawText) {

String normalizedLineBreaks = rawText
.replace("\uFEFF", "")
.replace("\u00AD", "")
.replace("\r\n", "\n")
.replace('\r', '\n');

Expand All @@ -31,7 +33,7 @@ public String normalize(String rawText) {
List<String> resultLines = new ArrayList<>();
for (int p = 0; p < pages.size(); p++) {
List<String> filtered = filterNoise(pages.get(p), profile);
List<String> merged = mergeHyphenatedBreaks(filtered);
List<String> merged = mergeWrappedLines(mergeHyphenatedBreaks(filtered));
if (!resultLines.isEmpty()) {
resultLines.add("");
}
Expand Down Expand Up @@ -93,6 +95,12 @@ private List<String> filterNoise(List<String> page, HeaderFooterProfile profile)
if (PAGE_NUMBER_LINE.matcher(line).matches()) {
continue;
}
if (TOC_DOTTED_LINE.matcher(line).matches()) {
continue;
}
if (looksLikeTableOfContents(line)) {
continue;
}
if (i == indexOfFirstNonBlank(page)
&& profile.headerCount().getOrDefault(headerSig, 0) >= profile.minRepetitions()) {
continue;
Expand Down Expand Up @@ -130,6 +138,35 @@ private List<String> mergeHyphenatedBreaks(List<String> lines) {
return merged;
}

/**
 * Joins consecutive non-blank lines into paragraphs.
 *
 * <p>A line is appended (separated by a single space) to the paragraph being built while
 * {@code shouldJoin} accepts the pair; otherwise it starts a new paragraph. Every blank
 * line ends the current paragraph and is emitted as an empty string.
 *
 * @param lines lines of one page, in reading order
 * @return the lines with wrapped paragraphs merged
 */
private List<String> mergeWrappedLines(List<String> lines) {
    List<String> result = new ArrayList<>();
    StringBuilder pending = null; // paragraph under construction, if any

    for (String line : lines) {
        if (line.isBlank()) {
            if (pending != null) {
                result.add(pending.toString());
                pending = null;
            }
            result.add("");
            continue;
        }
        if (pending != null && shouldJoin(pending.toString(), line)) {
            pending.append(' ').append(line);
            continue;
        }
        if (pending != null) {
            result.add(pending.toString());
        }
        pending = new StringBuilder(line);
    }

    if (pending != null) {
        result.add(pending.toString());
    }
    return result;
}

private String collapseBlankRuns(List<String> lines) {
StringBuilder out = new StringBuilder();
boolean lastBlank = false;
Expand Down Expand Up @@ -162,6 +199,31 @@ private boolean startsWithLowercase(String line) {
return Character.isLetter(codePoint) && Character.isLowerCase(codePoint);
}

// Precompiled once: shouldJoin runs for every candidate line pair, and String.matches
// would recompile these expressions on each call.
private static final Pattern NUMBERED_ITEM_START = Pattern.compile("^[0-9]+[\\).].*");
private static final Pattern ROMAN_ITEM_START = Pattern.compile("^[ivxlcdm]+[\\).].*");

/**
 * Decides whether {@code next} continues the paragraph text in {@code previous}.
 *
 * <p>Lines are not joined when {@code previous} ends with {@code . ! ? :} or {@code ;}, or
 * when {@code next} starts like a list item ({@code "- "}, digits followed by {@code )} or
 * {@code .}, or lowercase roman-numeral letters followed by {@code )} or {@code .}).
 * Otherwise they are joined if {@code next} starts with a lowercase letter or
 * {@code previous} is at least 45 characters long.
 *
 * @param previous paragraph text accumulated so far
 * @param next candidate continuation line
 * @return {@code true} if the two should be merged into one paragraph
 */
private boolean shouldJoin(String previous, String next) {
    // Terminal punctuation closes the paragraph.
    if (!previous.isEmpty() && ".!?:;".indexOf(previous.charAt(previous.length() - 1)) >= 0) {
        return false;
    }
    // Bullets and enumerations always start a line of their own.
    if (next.startsWith("- ")
        || NUMBERED_ITEM_START.matcher(next).matches()
        || ROMAN_ITEM_START.matcher(next).matches()) {
        return false;
    }
    return startsWithLowercase(next) || previous.length() >= 45;
}

/**
 * Heuristic for table-of-contents style lines.
 *
 * <p>A line qualifies when, compared case-insensitively, it starts with one of the keywords
 * {@code "capitulo "}, {@code "chapter "}, {@code "seccion "} or {@code "parte "} and ends
 * with a token of 1 to 8 digits and/or roman-numeral letters.
 *
 * @param line line to inspect
 * @return {@code true} if the line matches the heuristic
 */
private boolean looksLikeTableOfContents(String line) {
    String lowered = line.toLowerCase(Locale.ROOT);

    boolean hasHeadingKeyword = false;
    for (String keyword : new String[] {"capitulo ", "chapter ", "seccion ", "parte "}) {
        if (lowered.startsWith(keyword)) {
            hasHeadingKeyword = true;
            break;
        }
    }
    if (!hasHeadingKeyword) {
        return false;
    }
    // NOTE(review): the trailing token also matches ordinary words made only of the
    // letters i, v, x, l, c, d, m (for example "mi").
    return lowered.matches(".*\\b[0-9ivxlcdm]{1,8}$");
}

private boolean isHeaderFooterCandidate(String line) {
if (line == null || line.isBlank()) {
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,18 @@ public class PdfTextExtractor implements DocumentTextExtractor {
public String extract(Path path) {
try (PDDocument document = Loader.loadPDF(path.toFile())) {
PDFTextStripper stripper = new PDFTextStripper();
return stripper.getText(document);
stripper.setSortByPosition(true);
StringBuilder allText = new StringBuilder();
int totalPages = Math.max(1, document.getNumberOfPages());
for (int page = 1; page <= totalPages; page++) {
stripper.setStartPage(page);
stripper.setEndPage(page);
allText.append(stripper.getText(document));
if (page < totalPages) {
allText.append('\f');
}
}
return allText.toString();
} catch (IOException e) {
throw new IllegalArgumentException("No se pudo leer PDF: " + path, e);
}
Expand Down
1 change: 1 addition & 0 deletions apps/backend/src/main/resources/application.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ app.name=AutoBook Adventure
app.data.dir=.autobook-data
app.save.file=savegame.properties
app.books.dir=books
app.import.max-bytes=26214400
app.scene.max.chars=420
app.scene.lines-per-chunk=4

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;

import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

class BookImportServiceTest {
Expand All @@ -24,4 +26,24 @@ void shouldImportFromFileUri(@TempDir Path tempDir) throws Exception {
assertTrue(Files.exists(asset.path()));
assertTrue(asset.title().endsWith(".txt"));
}

/** A source file whose extension is neither .txt nor .pdf must be rejected. */
@Test
void shouldRejectUnsupportedExtension(@TempDir Path tempDir) throws Exception {
    Path docxFile = Files.writeString(tempDir.resolve("sample.docx"), "contenido");
    Path catalogDir = tempDir.resolve("catalog");
    BookImportService importer = new BookImportService(catalogDir, 1024 * 1024);

    assertThrows(IllegalArgumentException.class, () -> importer.importFromInput(docxFile.toString()));
}

/** A 3000-byte file must be rejected when the service limit is 1024 bytes. */
@Test
void shouldRejectFilesAboveMaxImportSize(@TempDir Path tempDir) throws Exception {
    byte[] oversizedContent = new byte[3000];
    Arrays.fill(oversizedContent, (byte) 'a');
    Path bigFile = Files.write(tempDir.resolve("big-book.txt"), oversizedContent);

    BookImportService importer = new BookImportService(tempDir.resolve("catalog"), 1024);

    assertThrows(IllegalArgumentException.class, () -> importer.importFromInput(bigFile.toString()));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,22 @@ void shouldMergeHyphenatedLineBreaks() {

assertTrue(clean.contains("historia para ninos."));
}

/** Dotted index lines are dropped and a wrapped sentence is merged into one line. */
@Test
void shouldRemoveIndexLinesAndJoinWrappedParagraphs() {
    String extractedText = """
        Capitulo 1 ............. 9
        Capitulo 2 ............. 15
        El caballero miro el castillo
        y decidio continuar con cautela
        porque la noche era fria.
        """;

    String normalizedText = normalizer.normalize(extractedText);

    // Index entries must be gone.
    assertFalse(normalizedText.contains("Capitulo 1"));
    assertFalse(normalizedText.contains("Capitulo 2"));
    // The three wrapped lines must form a single sentence.
    String expectedSentence =
        "El caballero miro el castillo y decidio continuar con cautela porque la noche era fria.";
    assertTrue(normalizedText.contains(expectedSentence));
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,25 @@ void shouldExtractTextFromPdf(@TempDir Path tempDir) throws Exception {
stream.showText("El caballero aprende a escuchar su corazon.");
stream.endText();
}
PDPage page2 = new PDPage(PDRectangle.LETTER);
document.addPage(page2);
try (PDPageContentStream stream = new PDPageContentStream(document, page2)) {
stream.beginText();
stream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12);
stream.newLineAtOffset(100, 700);
stream.showText("En la segunda pagina reflexiona y continua.");
stream.endText();
}
document.save(pdf.toFile());
}

PdfTextExtractor extractor = new PdfTextExtractor();
String text = extractor.extract(pdf).toLowerCase();
String text = extractor.extract(pdf);
String normalized = text.toLowerCase();

assertTrue(text.contains("caballero"));
assertTrue(text.contains("corazon"));
assertTrue(normalized.contains("caballero"));
assertTrue(normalized.contains("corazon"));
assertTrue(normalized.contains("segunda pagina"));
assertTrue(text.contains("\f"));
}
}
1 change: 1 addition & 0 deletions apps/backend/src/test/resources/application.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ app.name=AutoBook Adventure Test
app.data.dir=target/test-data
app.save.file=savegame.properties
app.books.dir=target/test-books
app.import.max-bytes=1048576
app.scene.max.chars=420
app.scene.lines-per-chunk=4

Expand Down
2 changes: 1 addition & 1 deletion docs/NEXT_STEPS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
- Agregar estrategia de backup/restore y retencion.

2. Seguridad y robustez
- Limitar tamano y tipo de archivos importados.
- Endurecer validaciones de import (MIME real + escaneo anti-archivo malicioso).
- Validar rutas y endurecer manejo de errores.
- Agregar rate limiting basico por IP/sesion.

Expand Down