Commit 1fc7976

Parallelize manifest build and add progress tracking
Split manifest generation into up to 4 concurrent rclone lsjson workers to reduce listing time for large datasets. Add `manifest combine` CLI command, prepare job phase tracking (progress.json), manifest build progress reporting, and bump prepare resources (250G mem, 4-day limit).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ef742db commit 1fc7976

File tree

5 files changed: +316 -10 lines changed

CHANGELOG.md

Lines changed: 39 additions & 0 deletions
New file (39 additions, 0 deletions):

# Changelog

## Unreleased (feature/slack-claude)

### Parallel Manifest Build

- Parallelize manifest generation into up to 4 concurrent `rclone lsjson` workers, reducing listing time for large datasets with many top-level subdirectories (`slurm_tools.py`)
- Add `manifest combine` CLI command to merge parallel lsjson part files (with `.prefix` sidecars) into a unified `manifest.jsonl` (`cli.py`)
- Bump prepare job memory from 16 GB to 250 GB to accommodate large listings (`slurm_tools.py`)
- Add `--max-backlog=1000000` to `rclone lsjson` calls to prevent the walker from stalling on large buckets (`cli.py`, `slurm_tools.py`)
- Report manifest build progress (files listed, bytes listed) via a `manifest.jsonl.progress` sidecar file (`cli.py`)
- Track prepare job phases (`listing_source`, `combining_manifest`, `analyzing`, `sharding`, `rendering`, `submitting`) in `progress.json` (`slurm_tools.py`)

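The parallel-worker split itself lives in `slurm_tools.py` and is not part of this diff. A minimal sketch of how top-level subdirectories could be fanned out across up to 4 `rclone lsjson` workers, producing the `lsjson-*.json` part files and `.prefix` sidecars that `manifest combine` expects (`partition_dirs`, `list_group`, and `build_parts` are illustrative names, not the real implementation):

```python
import subprocess
from concurrent.futures import ThreadPoolExecutor
from typing import List

MAX_WORKERS = 4


def partition_dirs(top_dirs: List[str], max_workers: int = MAX_WORKERS) -> List[List[str]]:
    """Round-robin top-level subdirectories into up to `max_workers` groups."""
    n = min(max_workers, len(top_dirs)) or 1
    groups: List[List[str]] = [[] for _ in range(n)]
    for i, d in enumerate(sorted(top_dirs)):
        groups[i % n].append(d)
    return groups


def list_group(source: str, dirs: List[str], out_prefix: str = "lsjson") -> None:
    """List each subdirectory in this group with rclone, writing one
    lsjson-<dir>.json part file plus a .prefix sidecar naming the subdir."""
    for d in dirs:
        with open(f"{out_prefix}-{d}.json", "w") as f:
            subprocess.run(
                ["rclone", "lsjson", "-R", "--files-only", f"{source}/{d}"],
                stdout=f, check=True,
            )
        with open(f"{out_prefix}-{d}.prefix", "w") as f:
            f.write(d + "\n")


def build_parts(source: str, top_dirs: List[str]) -> None:
    """Run the listing groups concurrently, one thread per group."""
    groups = partition_dirs(top_dirs)
    with ThreadPoolExecutor(max_workers=len(groups)) as ex:
        for g in groups:
            ex.submit(list_group, source, g)
```

The round-robin split keeps worker counts balanced when subdirectory sizes are unknown in advance.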
### Claude-Powered Slack Bot

- Add Claude-powered Slack bot for interactive data transfer requests via Slack threads
- Add intelligent rclone flag selection based on file size distribution analysis
- Add `check_path_exists` tool to validate source paths before submitting jobs
- Add `list_buckets` tool to enumerate buckets at remote endpoints
- Add `read_job_logs` tool to access job analysis data, prepare logs, and shard transfer logs
- Add lightweight Haiku triage to filter thread messages and skip unrelated chatter
- Add per-user job ownership so only the submitting user can cancel their jobs
- Restore thread context from the Slack API after bot restarts
- Report manifest listing progress (`files_listed`, `bytes_listed`) in job status during the `building_manifest` phase

### Slurm Robustness

- Increase prepare job time limit to 4 days for very large datasets
- Unset conflicting `SLURM_MEM_*` environment variables in `prepare.sh`
- Add `--export=NONE` to all `sbatch` calls to prevent environment leakage
- Allow users to set a lower array concurrency (max 64)
- Verify the source path exists before creating the run directory and submitting jobs
- Reduce thread history limits to mitigate Slack rate limits

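The two environment-hygiene items above can be sketched in Python for illustration (the real logic is an `unset` loop in `prepare.sh` and the `sbatch` invocation in the submit path; `scrub_slurm_mem_env` and `sbatch_cmd` are hypothetical helpers):

```python
from typing import Dict, List


def scrub_slurm_mem_env(env: Dict[str, str]) -> Dict[str, str]:
    """Drop SLURM_MEM_* variables that would conflict with the prepare
    job's own memory request (mirrors the unset loop in prepare.sh)."""
    return {k: v for k, v in env.items() if not k.startswith("SLURM_MEM_")}


def sbatch_cmd(script: str) -> List[str]:
    """Build an sbatch invocation; --export=NONE keeps the submitting
    shell's environment from leaking into the job."""
    return ["sbatch", "--export=NONE", script]
```

Without `--export=NONE`, sbatch exports the caller's environment by default, so a leaked `SLURM_MEM_PER_NODE` from a parent job can silently override the script's own `#SBATCH --mem` directive.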
### Bug Fixes

- Fix `run_id` format to be filename-safe (no colons)
- Fix markdown rendering in Slack responses
- Improve error logging for manifest build failures (write to `xfer-err/` with full context)
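The `run_id` fix above boils down to avoiding colons, which are awkward or illegal in filenames on some filesystems. A minimal sketch of a filename-safe generator consistent with that fix (the real `now_run_id` helper in the package may use a different exact format):

```python
from datetime import datetime, timezone


def now_run_id() -> str:
    """Filename-safe run identifier: a UTC timestamp where the usual
    colons in the time component are replaced with hyphens,
    e.g. 2024-01-02T03-04-05Z instead of 2024-01-02T03:04:05Z."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%SZ")
```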

src/xfer/cli.py

Lines changed: 133 additions & 1 deletion
```diff
@@ -246,6 +246,7 @@ def manifest_build(
     rclone_cmd += shlex.split(extra_lsjson_flags)
 
     rclone_cmd.append("--files-only")
+    rclone_cmd.append("--max-backlog=1000000")
 
     srun_cmd = ["srun", "-n", "1", "-c", "8", "--no-container-remap-root"]
     srun_cmd += pyxis_container_args(
```
```diff
@@ -301,9 +302,22 @@ def manifest_build(
         eprint(f"SLURM memory env vars: {slurm_mem_vars}")
         raise
 
-    # Build JSONL
+    # Build JSONL with progress reporting
     n = 0
     bytes_total = 0
+    progress_file = out.parent / "manifest.jsonl.progress"
+    last_progress_n = 0
+    PROGRESS_INTERVAL = 10_000  # update progress file every 10k files
+
+    def _write_progress() -> None:
+        """Write current listing progress to a sidecar file."""
+        try:
+            progress_file.write_text(
+                json.dumps({"files_listed": n, "bytes_listed": bytes_total})
+            )
+        except OSError:
+            pass
+
     with out.open("w", encoding="utf-8") as f:
         for item in parse_lsjson_items(cp.stdout):
             # Skip directories
@@ -345,6 +359,13 @@ def manifest_build(
             f.write(json.dumps(rec, separators=(",", ":")) + "\n")
             n += 1
 
+            if n - last_progress_n >= PROGRESS_INTERVAL:
+                _write_progress()
+                eprint(f"  manifest progress: {n:,} files, {bytes_total:,} bytes")
+                last_progress_n = n
+
+    # Final progress update and cleanup
+    _write_progress()
     eprint(f"Wrote {n} items, {bytes_total} bytes -> {out}")
 
 
```
```diff
@@ -515,6 +536,117 @@ def manifest_analyze(
     print(json_output)
 
 
+@manifest_app.command("combine")
+def manifest_combine(
+    source: str = typer.Option(
+        ..., help="rclone source root, e.g. s3src:bucket/prefix"
+    ),
+    dest: str = typer.Option(..., help="rclone dest root, e.g. s3dst:bucket/prefix"),
+    parts_dir: Path = typer.Option(
+        ..., exists=True, help="Directory containing lsjson-*.json part files", resolve_path=True
+    ),
+    out: Path = typer.Option(..., help="Output manifest JSONL path", resolve_path=True),
+    run_id: Optional[str] = typer.Option(
+        None, help="Run identifier; default is generated"
+    ),
+) -> None:
+    """
+    Combine multiple lsjson part files into a unified manifest.jsonl.
+
+    Reads lsjson-*.json files from --parts-dir, adjusts paths using .prefix
+    sidecar files, and writes a single manifest JSONL.
+    """
+    run_id = run_id or now_run_id()
+    mkdirp(out.parent)
+
+    # Glob part files
+    part_files = sorted(parts_dir.glob("lsjson-*.json"))
+    if not part_files:
+        eprint(f"No lsjson-*.json files found in {parts_dir}")
+        raise typer.Exit(code=2)
+
+    n = 0
+    bytes_total = 0
+    last_progress_n = 0
+    PROGRESS_INTERVAL = 10_000
+    progress_file = out.parent / "manifest.jsonl.progress"
+
+    def _write_progress() -> None:
+        try:
+            progress_file.write_text(
+                json.dumps({"files_listed": n, "bytes_listed": bytes_total})
+            )
+        except OSError:
+            pass
+
+    with out.open("w", encoding="utf-8") as f:
+        for part_file in part_files:
+            # Determine prefix from sidecar file
+            prefix_file = part_file.with_suffix(".prefix")
+            prefix = ""
+            if prefix_file.exists():
+                prefix = prefix_file.read_text(encoding="utf-8").strip()
+
+            # Read the JSON array
+            try:
+                items = json.loads(part_file.read_text(encoding="utf-8"))
+                if not isinstance(items, list):
+                    eprint(f"WARNING: {part_file} is not a JSON array, skipping")
+                    continue
+            except (json.JSONDecodeError, OSError) as e:
+                eprint(f"WARNING: Failed to read {part_file}: {e}, skipping")
+                continue
+
+            for item in items:
+                if not isinstance(item, dict):
+                    continue
+                if item.get("IsDir") is True:
+                    continue
+
+                rel_path = item.get("Path")
+                if not rel_path or not isinstance(rel_path, str):
+                    continue
+
+                # Adjust path with prefix
+                if prefix:
+                    rel_path = prefix + "/" + rel_path
+
+                size = int(item.get("Size") or 0)
+                bytes_total += size
+
+                mtime = item.get("ModTime")
+                hashes = item.get("Hashes") if isinstance(item.get("Hashes"), dict) else {}
+                etag = item.get("ETag") or item.get("etag")
+                storage_class = item.get("StorageClass")
+                meta = item.get("Metadata") if isinstance(item.get("Metadata"), dict) else {}
+
+                rec = {
+                    "schema": SCHEMA,
+                    "run_id": run_id,
+                    "source_root": source,
+                    "dest_root": dest,
+                    "source": source.rstrip("/") + "/" + rel_path,
+                    "dest": stable_dest_for_source(source, dest, rel_path),
+                    "path": rel_path,
+                    "size": size,
+                    "mtime": mtime,
+                    "hashes": hashes,
+                    "etag": etag,
+                    "storage_class": storage_class,
+                    "meta": meta,
+                }
+                f.write(json.dumps(rec, separators=(",", ":")) + "\n")
+                n += 1
+
+                if n - last_progress_n >= PROGRESS_INTERVAL:
+                    _write_progress()
+                    eprint(f"  manifest progress: {n:,} files, {bytes_total:,} bytes")
+                    last_progress_n = n
+
+    _write_progress()
+    eprint(f"Combined {len(part_files)} parts -> {n} items, {bytes_total} bytes -> {out}")
+
+
 # -----------------------------
 # Slurm render/submit
 # -----------------------------
```
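The prefix-join behaviour of `manifest combine` can be exercised in isolation. The sketch below reimplements just the path handling from the command above (`combine_parts` is an illustrative helper, not part of the CLI, and it omits the full record construction):

```python
import json
from pathlib import Path
from typing import List


def combine_parts(parts_dir: Path) -> List[str]:
    """Merge lsjson-*.json part files, prepending each part's .prefix
    sidecar (if present) to its relative paths."""
    paths: List[str] = []
    for part in sorted(parts_dir.glob("lsjson-*.json")):
        # .prefix sidecar records which top-level subdir this part covers
        prefix_file = part.with_suffix(".prefix")
        prefix = prefix_file.read_text().strip() if prefix_file.exists() else ""
        for item in json.loads(part.read_text()):
            if item.get("IsDir") is True:
                continue  # directories never enter the manifest
            rel = item.get("Path")
            if not rel:
                continue
            paths.append(prefix + "/" + rel if prefix else rel)
    return paths
```

Because each worker lists only its own subdirectory, the `Path` values in a part file are relative to that subdirectory; the sidecar prefix restores paths relative to the overall source root.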

src/xfer/slackbot/claude_agent.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -84,7 +84,9 @@
         "name": "check_status",
         "description": """Check the status of transfer jobs in this thread. Use this when the user asks about job status, progress, or wants to know if their transfer is complete.
 
-This tool finds all jobs associated with the current Slack thread and returns their status.""",
+This tool finds all jobs associated with the current Slack thread and returns their status.
+
+When the phase is "building_manifest", the prepare job is listing files at the source. This can take up to several days for large datasets and is normal. The response may include files_listed and bytes_listed if the JSONL writing phase has started, or prepare_phase/prepare_detail for finer-grained progress. Only flag a concern if the job has been in this phase for more than 48 hours with no observable progress.""",
         "input_schema": {
             "type": "object",
             "properties": {
@@ -292,6 +294,7 @@
 6. When reporting job status, include relevant details like progress and any errors
 7. If users want custom rclone flags (e.g., bandwidth limits, checksum verification), pass them via the rclone_flags parameter
 8. When a user reports a problem or asks you to investigate an issue with a transfer, ALWAYS use read_job_logs to examine the actual log files. The shard logs contain the real error messages from rclone and the transfer process. Do not guess at causes without reading the logs first.
+9. The preparation phase (manifest building) can take a long time for large datasets — up to several days is normal. The prepare job has a 4-day time limit. Only consider it a potential problem if the prepare job has been running for more than 48 hours with no progress. If the status includes files_listed or bytes_listed, report those numbers to the user so they can see listing is progressing. If those numbers are absent, the rclone listing is still running (it returns results all at once when complete).
 
 Transfer path format:
 - Paths should be in rclone format: "remote:bucket/path"
```
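The 48-hour rule stated in the prompt above could also be enforced in code rather than left entirely to the model. A sketch under assumed inputs (`manifest_looks_stalled` and its parameters are hypothetical; the bot's actual status plumbing is not shown in this diff):

```python
from datetime import datetime, timedelta
from typing import Optional

STALL_AFTER = timedelta(hours=48)


def manifest_looks_stalled(
    phase_started: datetime,
    last_progress_change: Optional[datetime],
    now: datetime,
) -> bool:
    """Flag building_manifest as a concern only after 48 hours with no
    observable progress. If the progress sidecar has never been written,
    fall back to the time the phase began."""
    reference = last_progress_change or phase_started
    return now - reference > STALL_AFTER
```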
