Skip to content

Commit c798dfa

Browse files
fix(tika): corregge rilevamento JAR Tika con supporto versioni e env var TIKA_JAR_PATH
1 parent ead786b commit c798dfa

File tree

4 files changed

+83
-21
lines changed

4 files changed

+83
-21
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ and the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.
1212
- Ollama context size reduced from 8192 to 2048 to match nomic-embed-text model limit
1313
- Centralized logging configuration for consistent pipeline logs visibility
1414
- UI now shows actual error messages instead of generic failure messages
15+
- Tika JAR detection now supports versioned filenames and TIKA_JAR_PATH env var
16+
- Improved Tika pre-download in Dockerfile.tika with verification and copy to expected path
1517

1618
## [1.1.2] - 2025-12-02
1719

Dockerfile.tika

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
4141
RUN ln -s /usr/lib/jvm/java-21-openjdk-* /usr/lib/jvm/java-21
4242
ENV JAVA_HOME=/usr/lib/jvm/java-21
4343
ENV PATH="${JAVA_HOME}/bin:${PATH}"
44+
# Tika JAR path (set after pre-download, used by lib/tika_check.py)
45+
ENV TIKA_JAR_PATH=/tmp/tika-server.jar
4446

4547
# Install Ollama
4648
RUN curl -fsSL https://ollama.ai/install.sh | sh
@@ -81,8 +83,23 @@ RUN ollama serve & \
8183
ollama pull nomic-embed-text && \
8284
pkill ollama || true
8385

84-
# Pre-download Tika JAR (triggers auto-download to /tmp/tika-server.jar)
85-
RUN python -c "from tika import parser; parser.from_buffer('test', xmlContent=False)" || true
86+
# Pre-download the Tika server JAR and make sure it ends up at
# /tmp/tika-server.jar (the path exported through TIKA_JAR_PATH).
#
# The inline script is joined into ONE logical Python line by the Dockerfile
# line continuations, so it may only contain simple statements separated by
# ';' -- no '#' comments and no compound statements (if/for/with), which
# would be a SyntaxError after a ';'.
#
# tika-python's `tika.TikaJarPath` is the DIRECTORY holding the JAR (the
# system temp dir unless TIKA_PATH is set); the file inside it is named
# tika-server.jar. A value that already points at a file is used as-is.
#
# The step stays best-effort: on failure the image still builds and the JAR
# is downloaded at runtime instead.
RUN python3 -c "\
import os, shutil; \
from tika import parser, tika; \
parser.from_buffer(b'test', xmlContent=False); \
jar_dir = getattr(tika, 'TikaJarPath', None) or '/tmp'; \
jar_path = jar_dir if os.path.isfile(jar_dir) else os.path.join(jar_dir, 'tika-server.jar'); \
print(f'Tika JAR downloaded to: {jar_path}'); \
target = '/tmp/tika-server.jar'; \
os.path.isfile(jar_path) and os.path.abspath(jar_path) != target and shutil.copy2(jar_path, target) and print('Copied to /tmp/tika-server.jar'); \
assert os.path.isfile(target), 'Tika JAR not found at /tmp/tika-server.jar'; \
print('Tika JAR verified at /tmp/tika-server.jar'); \
" || echo "WARNING: Tika JAR pre-download failed, will download at runtime"
86103

87104
# Create data directory for Qdrant
88105
RUN mkdir -p /data/qdrant

api/routes/upload.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,15 +115,19 @@ def run_indexing(job_id: str, collection_dir: Path, collection: str, filenames:
115115
# Import RagifyPipeline
116116
from ragify import RagifyPipeline
117117
from lib.config import RagifyConfig
118-
from lib.tika_check import is_tika_available
118+
from lib.tika_check import is_tika_available, check_tika_available
119119

120120
# Configure
121121
config = RagifyConfig.default()
122122
config.qdrant.collection = collection
123123

124-
# Check Tika availability
125-
use_tika = is_tika_available()
126-
logger.info(f"[{job_id}] Tika available: {use_tika}")
124+
# Check Tika availability with diagnostic info
125+
tika_status = check_tika_available()
126+
use_tika = tika_status['can_use_tika']
127+
logger.info(f"[{job_id}] Tika status: java={tika_status['java_installed']}, "
128+
f"jar={tika_status['tika_jar_available']}, path={tika_status.get('tika_jar_path')}")
129+
if tika_status['issues']:
130+
logger.warning(f"[{job_id}] Tika issues: {tika_status['issues']}")
127131

128132
jobs[job_id]["progress"] = 0.2
129133
jobs[job_id]["message"] = f"Processing with {'Tika' if use_tika else 'text-only'} mode"

lib/tika_check.py

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -49,23 +49,41 @@ def check_tika_jar_available() -> Tuple[bool, Optional[Path]]:
4949
Returns:
5050
Tuple of (is_available, jar_path)
5151
"""
52-
# Tika downloads to /tmp on Unix systems
53-
possible_paths = [
52+
# Check environment variable first (for Docker containers)
53+
env_path = os.getenv('TIKA_JAR_PATH')
54+
if env_path:
55+
jar_path = Path(env_path)
56+
if jar_path.exists():
57+
logger.debug(f"Tika JAR found via TIKA_JAR_PATH: {jar_path}")
58+
return True, jar_path
59+
60+
# Static paths to check
61+
static_paths = [
5462
Path('/tmp/tika-server.jar'),
55-
Path('/var/folders').rglob('tika-server.jar'), # macOS temp
5663
Path.home() / '.tika' / 'tika-server.jar',
5764
]
5865

59-
for path in possible_paths:
60-
if isinstance(path, Path) and path.exists():
66+
for path in static_paths:
67+
if path.exists():
68+
logger.debug(f"Tika JAR found at: {path}")
6169
return True, path
62-
# For glob results
63-
try:
64-
for found_path in path:
65-
if found_path.exists():
66-
return True, found_path
67-
except (TypeError, AttributeError):
68-
continue
70+
71+
# Glob patterns for versioned JARs (tika-server-X.Y.Z.jar)
72+
glob_patterns = [
73+
(Path('/tmp'), 'tika-server*.jar'),
74+
(Path.home() / '.tika', 'tika-server*.jar'),
75+
(Path('/var/folders'), '**/tika-server*.jar'), # macOS temp
76+
]
77+
78+
for base_path, pattern in glob_patterns:
79+
if base_path.exists():
80+
try:
81+
for found_path in base_path.glob(pattern):
82+
if found_path.exists() and found_path.is_file():
83+
logger.debug(f"Tika JAR found via glob: {found_path}")
84+
return True, found_path
85+
except (PermissionError, OSError):
86+
continue
6987

7088
return False, None
7189

@@ -239,13 +257,34 @@ def ensure_tika_ready(interactive: bool = True, auto_skip: bool = False) -> bool
239257

240258
def is_tika_available() -> bool:
    """
    Check if Tika is available for use.

    First consults check_tika_available(). If that reports Tika as unusable
    but Java is installed, falls back to asking tika-python where it keeps
    its server JAR, in case the JAR lives in a location the standard check
    does not probe.

    Returns:
        True if Tika can be used, False otherwise.
    """
    # First try standard check
    status = check_tika_available()
    if status['can_use_tika']:
        return True

    # Without Java the JAR cannot be run, so the fallback is pointless.
    if not status['java_installed']:
        return False

    # Fallback: try tika-python directly (it knows its own JAR location)
    try:
        from tika import tika

        configured = getattr(tika, 'TikaJarPath', None)
        if configured:
            jar_path = Path(configured)
            # tika-python's TikaJarPath is the *directory* that holds the
            # JAR (the temp dir by default); the file itself is named
            # tika-server.jar. Testing the directory for existence would
            # always succeed and report Tika as available with no JAR.
            if jar_path.is_dir():
                jar_path = jar_path / 'tika-server.jar'
            if jar_path.is_file():
                logger.info(f"Tika JAR found via tika-python: {jar_path}")
                return True
    except ImportError:
        # tika-python is not installed: nothing more to probe.
        pass
    except Exception as e:
        # Importing or probing tika-python failed; treat as "not available"
        # but keep the reason visible for debugging.
        logger.debug(f"Tika-python fallback check failed: {e}")

    return False
249288

250289

251290
def print_tika_status():

0 commit comments

Comments
 (0)