Poor inference on real LiDAR vs good on SemanticKITTI, despite matching voxelization stats (scale/density) #8

Hello, thank you for your work.

I observe reasonable semantic segmentation results on SemanticKITTI validation samples, but very poor results on real LiDAR data, even after trying to match scale, density, and voxelization settings.

Summary:

  • Model trained on SemanticKITTI achieves good validation performance (mIoU ≈ 0.65).
  • Inference on SemanticKITTI validation frames produces reasonable predictions (road, cars, etc.).
  • Inference on my real LiDAR tiles (cropped from a large .ply) is very poor.

I attach JSON files with all the statistics for both samples, plus screenshots.

First, I analyze the samples with the following script:

# analyze_before_infer.py
import os
import json
import argparse
import numpy as np
import torch
import open3d as o3d

import concerto
from concerto.transform import Compose


BASE_TRANSFORM_CONFIG = [
    dict(type="RandomScale", scale=[0.2, 0.2]),
    dict(
        type="GridSample",
        grid_size=0.01,
        hash_type="fnv",
        mode="train",
        return_grid_coord=True,
        return_inverse=True,
    ),
    # dict(type="CenterShift", apply_z=False),
    dict(type="CenterShift", apply_z=True),

    dict(type="NormalizeColor"),
    dict(type="ToTensor"),
    dict(
        type="Collect",
        keys=("coord", "grid_coord", "color", "inverse"),
        feat_keys=("coord", "color", "normal"),
    ),
]


def load_kitti_bin(path: str):
    # SemanticKITTI velodyne format: float32 [x, y, z, intensity]
    pts = np.fromfile(path, dtype=np.float32).reshape(-1, 4)
    coord = pts[:, :3]
    intensity = pts[:, 3:4]

    # min-max normalize intensity to [0, 1]; replicated to 3 channels below
    inten = intensity.copy()
    if inten.size > 0:
        mn = float(inten.min())
        mx = float(inten.max())
        denom = max(mx - mn, 1e-6)
        inten = (inten - mn) / denom
    color = np.repeat(inten, 3, axis=1).astype(np.float32)

    normal = np.zeros_like(coord, dtype=np.float32)
    return {"coord": coord.astype(np.float32), "color": color, "normal": normal}


def load_ply(path: str):
    pcd = o3d.io.read_point_cloud(path)
    coord = np.asarray(pcd.points, dtype=np.float32)
    color = np.asarray(pcd.colors, dtype=np.float32)
    normal = np.asarray(pcd.normals, dtype=np.float32)

    if color.size == 0:
        color = np.zeros((coord.shape[0], 3), dtype=np.float32)
    if normal.size == 0:
        normal = np.zeros_like(coord, dtype=np.float32)

    if coord.size > 0:
        m = np.isfinite(coord).all(axis=1)
        coord, color, normal = coord[m], color[m], normal[m]

    return {"coord": coord, "color": color, "normal": normal}


def load_point_file(path: str):
    ext = os.path.splitext(path)[1].lower()
    if ext == ".bin":
        return load_kitti_bin(path)
    if ext == ".ply":
        return load_ply(path)
    raise ValueError(f"Unsupported input extension: {ext}")


def save_ply(path: str, coord, color=None):
    coord = to_numpy(coord)

    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(coord)

    if color is not None:
        c = to_numpy(color)

        if c.dtype != np.float32:
            c = c.astype(np.float32)

        if c.size > 0 and c.max() > 1.5:
            c = c / 255.0

        c = np.clip(c, 0.0, 1.0)
        pcd.colors = o3d.utility.Vector3dVector(c)

    o3d.io.write_point_cloud(path, pcd)



def approx_nn_stats(coord: np.ndarray, sample_n: int = 20000, seed: int = 0):
    if coord.shape[0] == 0:
        return None
    n = coord.shape[0]
    rng = np.random.default_rng(seed)
    m = min(sample_n, n)
    idx = rng.choice(n, size=m, replace=False)
    sub = coord[idx]

    # KD-tree over the full cloud; querying ~20k sampled points is fast enough
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(coord)
    kdt = o3d.geometry.KDTreeFlann(pcd)

    dists = []
    for p in sub:
        _, ii, dd = kdt.search_knn_vector_3d(p, 2)  # nearest incl self
        if len(dd) >= 2:
            dists.append(np.sqrt(dd[1]))
    if not dists:
        return None
    d = np.array(dists, dtype=np.float32)
    return {
        "nn_min": float(np.min(d)),
        "nn_med": float(np.median(d)),
        "nn_p95": float(np.percentile(d, 95)),
        "nn_p99": float(np.percentile(d, 99)),
        "nn_max": float(np.max(d)),
        "nn_mean": float(np.mean(d)),
        "sample_n": int(len(d)),
    }


def to_numpy(a):
    """Accept np.ndarray or torch.Tensor and return np.ndarray (cpu)."""
    if isinstance(a, torch.Tensor):
        return a.detach().cpu().numpy()
    return a


def bounds_stats(coord):
    coord = to_numpy(coord)
    if coord is None or coord.shape[0] == 0:
        return None
    mn = coord.min(axis=0)
    mx = coord.max(axis=0)
    rg = mx - mn
    ctr = (mx + mn) / 2.0
    r = np.linalg.norm(coord, axis=1)
    return {
        "mins": mn.tolist(),
        "maxs": mx.tolist(),
        "range": rg.tolist(),
        "center": ctr.tolist(),
        "r_min": float(r.min()),
        "r_med": float(np.median(r)),
        "r_p95": float(np.percentile(r, 95)),
        "r_max": float(r.max()),
    }


def color_stats(color):
    color = to_numpy(color)
    if color is None or color.shape[0] == 0:
        return None
    c = color.astype(np.float32, copy=False)
    return {
        "min": c.min(axis=0).tolist(),
        "max": c.max(axis=0).tolist(),
        "mean": c.mean(axis=0).tolist(),
        "std": c.std(axis=0).tolist(),
        "frac_outside_0_1": float(np.mean((c < 0).any(axis=1) | (c > 1).any(axis=1))),
        "frac_zero": float(np.mean(np.all(c == 0, axis=1))),
    }


def grid_occupancy_probe(coord, grid_size: float):
    coord = to_numpy(coord)
    if coord is None or coord.shape[0] == 0:
        return None
    g = float(grid_size)
    scaled = coord / g
    grid = np.floor(scaled).astype(np.int64)

    mn = grid.min(axis=0)
    grid = grid - mn

    key = np.rec.fromarrays(grid.T, names="x,y,z", formats="i8,i8,i8")  # np.core.records is deprecated
    _, count = np.unique(key, return_counts=True)
    count = count.astype(np.int32)

    return {
        "grid_size": g,
        "voxels": int(count.size),
        "mean": float(count.mean()),
        "med": float(np.median(count)),
        "p95": float(np.percentile(count, 95)),
        "p99": float(np.percentile(count, 99)),
        "max": int(count.max()),
    }


def validate_inverse(inverse, n_ds: int):
    inv = to_numpy(inverse)

    ok = True
    info = {
        "len": int(inv.shape[0]),
        "min": int(inv.min()) if inv.size else None,
        "max": int(inv.max()) if inv.size else None,
        "n_ds": int(n_ds),
        "frac_oob": None,
    }

    if inv.size:
        oob = (inv < 0) | (inv >= n_ds)   # numpy bool array
        info["frac_oob"] = float(oob.mean())
        ok = (info["frac_oob"] == 0.0)

    return ok, info


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--input", required=True, type=str)
    ap.add_argument("--outdir", default="./preflight_out", type=str)
    ap.add_argument("--grid_size", default=0.06, type=float)
    ap.add_argument("--mode", default="train", choices=["train", "test"])
    ap.add_argument("--save_ply", action="store_true")
    ap.add_argument("--seed", type=int, default=46647087)
    ap.add_argument("--nn_sample", type=int, default=20000)
    ap.add_argument("--no_color", action="store_true")
    ap.add_argument("--no_normal", action="store_true")
    args = ap.parse_args()

    os.makedirs(args.outdir, exist_ok=True)
    concerto.utils.set_seed(args.seed)

    # ---- load
    point = load_point_file(args.input)
    if args.no_color:
        point["color"] = np.zeros_like(point["coord"], dtype=np.float32)
    if args.no_normal:
        point["normal"] = np.zeros_like(point["coord"], dtype=np.float32)

    report = {"input": args.input, "grid_size": float(args.grid_size), "mode": args.mode}

    coord0 = point["coord"]
    color0 = point["color"]

    report["pre"] = {
        "n": int(coord0.shape[0]),
        "bounds": bounds_stats(coord0),
        "nn": approx_nn_stats(coord0, sample_n=args.nn_sample, seed=args.seed),
        "color": color_stats(color0),
        "nan_inf_frac": float(np.mean(~np.isfinite(coord0).all(axis=1))) if coord0.size else 0.0,
    }
    report["pre"]["occupancy_probe"] = grid_occupancy_probe(coord0, args.grid_size)

    if args.save_ply:
        save_ply(os.path.join(args.outdir, "pre.ply"), coord0, color0)

    # ---- build transform = SAME as infer, but override grid_size and mode
    cfg = []
    for t in BASE_TRANSFORM_CONFIG:
        tt = dict(t)
        if tt.get("type") == "GridSample":
            tt["grid_size"] = float(args.grid_size)
            tt["mode"] = args.mode
        cfg.append(tt)

    transform = Compose(cfg)

    # ---- apply transform
    out = transform(point)

    # NOTE: if mode=test, GridSample returns a list of parts.
    if isinstance(out, list):
        report["post"] = {"mode_test_parts": len(out)}
        # analyze first part as representative
        out0 = out[0]
        report["post"]["part0_n_ds"] = int(out0["coord"].shape[0])
        report["post"]["part0_bounds"] = bounds_stats(out0["coord"])
        if "grid_coord" in out0:
            gc = to_numpy(out0["grid_coord"])
            report["post"]["part0_grid_coord"] = {
                "min": gc.min(axis=0).tolist(),
                "max": gc.max(axis=0).tolist(),
                "shape": list(gc.shape),
            }
        if "inverse" in out0:
            ok, inv_info = validate_inverse(out0["inverse"], int(out0["coord"].shape[0]))
            report["post"]["part0_inverse_ok"] = ok
            report["post"]["part0_inverse"] = inv_info

        if args.save_ply:
            save_ply(os.path.join(args.outdir, "post_part0_ds.ply"),
                     out0["coord"], out0.get("color", None))
    else:
        coord_ds = out["coord"]
        report["post"] = {
            "n_ds": int(coord_ds.shape[0]),
            "bounds": bounds_stats(coord_ds),
        }
        if "grid_coord" in out:
            gc = to_numpy(out["grid_coord"])
            report["post"]["grid_coord"] = {
                "min": gc.min(axis=0).tolist(),
                "max": gc.max(axis=0).tolist(),
                "shape": list(gc.shape),
            }
            # estimated physical extent implied by grid coord
            g = float(args.grid_size)
            ext = (gc.max(axis=0) - gc.min(axis=0) + 1).astype(np.float32) * g
            report["post"]["grid_extent_m"] = ext.tolist()

        if "inverse" in out:
            inv = out["inverse"]
            ok, inv_info = validate_inverse(inv, int(coord_ds.shape[0]))
            report["post"]["inverse_ok"] = ok
            report["post"]["inverse"] = inv_info

        # extra: save voxel centers PLY (helps see voxel size visually)
        if args.save_ply and "grid_coord" in out:
            gc = to_numpy(out["grid_coord"]).astype(np.float32)
            centers = (gc + 0.5) * float(args.grid_size)
            save_ply(os.path.join(args.outdir, "voxel_centers.ply"), centers, None)


        if args.save_ply:
            save_ply(os.path.join(args.outdir, "post_ds.ply"),
                     coord_ds, out.get("color", None))

    # ---- dump report
    rep_path = os.path.join(args.outdir, "report.json")
    with open(rep_path, "w") as f:
        json.dump(report, f, indent=2)
    print(f"[saved] {rep_path}")

    # ---- print key highlights for quick scanning
    pre = report["pre"]
    print("\n=== PRE ===")
    print(f"N={pre['n']}")
    b = pre["bounds"]
    if b:
        print("range(dx,dy,dz) =", np.round(np.array(b["range"]), 6).tolist())
        print("center =", np.round(np.array(b["center"]), 6).tolist())
        print("r_med/p95/max =", b["r_med"], b["r_p95"], b["r_max"])
    if pre.get("occupancy_probe"):
        o = pre["occupancy_probe"]
        print(f"occupancy@grid={o['grid_size']}: voxels={o['voxels']} mean={o['mean']:.2f} p99={o['p99']:.1f} max={o['max']}")
    if pre.get("nn"):
        nn = pre["nn"]
        print(f"nn_med={nn['nn_med']:.6f} nn_p95={nn['nn_p95']:.6f} nn_mean={nn['nn_mean']:.6f}")

    print("\n=== POST ===")
    post = report["post"]
    if "n_ds" in post:
        print(f"N_ds={post['n_ds']}")
        if post.get("grid_coord"):
            gg = post["grid_coord"]
            print("grid_coord min/max =", gg["min"], gg["max"])
            print("grid_extent_m =", np.round(np.array(post.get("grid_extent_m", [])), 6).tolist())
        if "inverse_ok" in post:
            print("inverse_ok =", post["inverse_ok"], "frac_oob =", post["inverse"].get("frac_oob"))
    else:
        print(post)




if __name__ == "__main__":
    main()
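
For reference, I run this script as follows (paths are placeholders):

python analyze_before_infer.py --input /path/to/tile.ply --outdir ./preflight_out --grid_size 0.05 --save_ply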

I set apply_z=True for my sample and apply_z=False when running on a KITTI sample.
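
A minimal sketch of that toggle (my own convenience code, keyed on file extension):

# sketch: my tiles are .ply, KITTI frames are .bin
apply_z = os.path.splitext(args.input)[1].lower() == ".ply"
for t in cfg:
    if t.get("type") == "CenterShift":
        t["apply_z"] = apply_z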

report.json - JSON stats for my sample

report.json - JSON stats for the KITTI validation sample

Logs for my sample:

=== PRE ===
N=45388
range(dx,dy,dz) = [14.0, 3.294998, 8.0]
center = [-13290.058594, -172.334503, -11003.033203]
r_med/p95/max = 17255.833984375 17260.4140625 17262.54296875
occupancy@grid=0.01: voxels=45388 mean=1.00 p99=1.0 max=1
nn_med=0.052494 nn_p95=0.062577 nn_mean=0.054039

=== POST ===
N_ds=44223
grid_coord min/max = [0, 0, 0] [280, 66, 160]
grid_extent_m = [2.81, 0.67, 1.61]
inverse_ok = True frac_oob = 0.0

Logs for KITTI:

=== PRE ===
N=117482
range(dx,dy,dz) = [158.607651, 60.406651, 6.473855]
center = [0.285553, -14.823313, -0.478673]
r_med/p95/max = 8.882246017456055 24.29372787475586 79.70906829833984
occupancy@grid=0.01: voxels=117021 mean=1.00 p99=1.0 max=2
nn_med=0.030784 nn_p95=0.127395 nn_mean=0.045949

=== POST ===
N_ds=84262
grid_coord min/max = [0, 0, 0] [3172, 1208, 130]
grid_extent_m = [31.73, 12.09, 1.31]
inverse_ok = True frac_oob = 0.0

Note: RandomScale(scale=[0.2,0.2]) is enabled, so post-bounds are in scaled coordinates.
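
A quick arithmetic check (mine, not from the repo) that the POST extents are just the PRE ranges times the scale factor:

import numpy as np

scale = 0.2  # RandomScale factor from the config above
print(np.array([2.81, 0.67, 1.61]) / scale)    # [14.05  3.35  8.05]  ~ my PRE range [14.0, 3.295, 8.0]
print(np.array([31.73, 12.09, 1.31]) / scale)  # [158.65 60.45 6.55]  ~ KITTI PRE range [158.6, 60.4, 6.47]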

For additional context, I trained the model on SemanticKITTI and achieved good results:

[2025-12-22 20:12:49,875 INFO test.py line 340 42338] Val result: mIoU/mAcc/allAcc 0.6545/0.7329/0.9065
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_0 - car Result: iou/accuracy 0.9592/0.9850
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_1 - bicycle Result: iou/accuracy 0.4861/0.5640
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_2 - motorcycle Result: iou/accuracy 0.6834/0.7572
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_3 - truck Result: iou/accuracy 0.8639/0.9583
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_4 - other-vehicle Result: iou/accuracy 0.6338/0.6886
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_5 - person Result: iou/accuracy 0.7544/0.8516
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_6 - bicyclist Result: iou/accuracy 0.9041/0.9430
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_7 - motorcyclist Result: iou/accuracy 0.0000/0.0000
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_8 - road Result: iou/accuracy 0.9131/0.9593
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_9 - parking Result: iou/accuracy 0.4716/0.5396
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_10 - sidewalk Result: iou/accuracy 0.7510/0.8935
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_11 - other-ground Result: iou/accuracy 0.1161/0.1714
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_12 - building Result: iou/accuracy 0.8886/0.9631
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_13 - fence Result: iou/accuracy 0.5907/0.7776
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_14 - vegetation Result: iou/accuracy 0.8728/0.9267
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_15 - trunk Result: iou/accuracy 0.7112/0.7974
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_16 - terrain Result: iou/accuracy 0.7181/0.8017
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_17 - pole Result: iou/accuracy 0.6307/0.7824
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_18 - traffic-sign Result: iou/accuracy 0.4859/0.5654
[2025-12-22 20:12:49,876 INFO test.py line 354 42338] <<<<<<<<<<<<<<<<<< End Evaluation <<<<<<<<<<<<<<<<<<

Here is the inference code:

# demo/kitti_infer_vis.py
import os
import argparse
import numpy as np
import torch
import torch.nn as nn
import open3d as o3d

import concerto
from concerto.transform import Compose

try:
    import flash_attn  # noqa
except Exception:
    flash_attn = None

device = "cuda" if torch.cuda.is_available() else "cpu"

# ----------------------------
# SemanticKITTI 19-class meta
# ----------------------------
KITTI_VALID_CLASS_IDS = tuple(range(19))
KITTI_CLASS_LABELS = (
    "car", "bicycle", "motorcycle", "truck", "other-vehicle",
    "person", "bicyclist", "motorcyclist",
    "road", "parking", "sidewalk", "other-ground",
    "building", "fence", "vegetation", "trunk",
    "terrain", "pole", "traffic-sign",
)

KITTI_COLOR_MAP = {
    0: (255.0, 0.0, 0.0),
    1: (0.0, 255.0, 0.0),
    2: (0.0, 0.0, 255.0),
    3: (255.0, 255.0, 0.0),
    4: (255.0, 0.0, 255.0),
    5: (0.0, 255.0, 255.0),
    6: (255.0, 128.0, 0.0),
    7: (128.0, 0.0, 255.0),
    8: (128.0, 128.0, 128.0),
    9: (255.0, 192.0, 203.0),
    10: (0.0, 128.0, 128.0),
    11: (255.0, 215.0, 0.0),
    12: (70.0, 130.0, 180.0),
    13: (165.0, 42.0, 42.0),
    14: (50.0, 205.0, 50.0),
    15: (255.0, 99.0, 71.0),
    16: (0.0, 100.0, 0.0),
    17: (211.0, 211.0, 211.0),
    18: (255.0, 255.0, 255.0),
}
CLASS_COLOR = np.array([KITTI_COLOR_MAP[i] for i in KITTI_VALID_CLASS_IDS], dtype=np.float32) / 255.0


# ----------------------------
# Transform config (earlier variant kept commented out below)
# ----------------------------
# TRANSFORM_CONFIG = [
#     dict(type="RandomScale", scale=[1, 1]),
#     dict(
#         type="GridSample",
#         grid_size=0.06,
#         hash_type="fnv",
#         mode="train",
#         return_grid_coord=True,
#         return_inverse=True,
#     ),
#     dict(type="CenterShift", apply_z=False),
#     dict(type="NormalizeColor"),
#     dict(type="ToTensor"),
#     dict(
#         type="Collect",
#         keys=("coord", "grid_coord", "color", "inverse"),
#         feat_keys=("coord", "color", "normal"),
#     ),
# ]

TRANSFORM_CONFIG = [
    dict(type="RandomScale", scale=[0.2, 0.2]),
    dict(
        type="GridSample",
        grid_size=0.01,
        hash_type="fnv",
        mode="train",
        return_grid_coord=True,
        return_inverse=True,
    ),
    # dict(type="CenterShift", apply_z=False),
    dict(type="CenterShift", apply_z=True),

    dict(type="NormalizeColor"),
    dict(type="ToTensor"),
    dict(
        type="Collect",
        keys=("coord", "grid_coord", "color", "inverse"),
        feat_keys=("coord", "color", "normal"),
    ),
]


# ----------------------------
# SegHead
# ----------------------------
class SegHead(nn.Module):
    def __init__(self, in_dim: int, num_classes: int):
        super().__init__()
        self.seg_head = nn.Linear(in_dim, num_classes)

    def forward(self, x):
        return self.seg_head(x)


# ----------------------------
# Checkpoint utils
# ----------------------------
def extract_state_dict(ckpt: dict) -> dict:
    for k in ["state_dict", "model", "net", "module"]:
        if k in ckpt and isinstance(ckpt[k], dict):
            return ckpt[k]
    return ckpt

def remap_keys(sd: dict) -> dict:
    out = {}
    for k, v in sd.items():
        kk = k
        if kk.startswith("module."):
            kk = kk[len("module."):]
        if kk.startswith("backbone."):
            kk = kk[len("backbone."):]
        if kk.startswith("e."):
            kk = kk[len("e."):]
        if kk.startswith("d."):
            kk = kk[len("d."):]
        out[kk] = v
    return out

def split_backbone_and_head(sd: dict):
    head = {}
    backbone = {}
    for k, v in sd.items():
        if k.startswith("seg_head."):
            head[k[len("seg_head."):]] = v  # weight/bias
        else:
            backbone[k] = v
    return backbone, head

def load_backbone_and_head(model, ckpt_path: str, device: str):
    ckpt = torch.load(ckpt_path, map_location="cpu")
    sd = remap_keys(extract_state_dict(ckpt))
    sd_backbone, sd_head = split_backbone_and_head(sd)

    incompatible = model.load_state_dict(sd_backbone, strict=False)
    missing = list(incompatible.missing_keys)
    unexpected = list(incompatible.unexpected_keys)
    print(f"[backbone] missing={len(missing)} unexpected={len(unexpected)}")
    print("missing[:10] =", missing[:10])
    print("unexpected[:10] =", unexpected[:10])

    if "weight" not in sd_head:
        raise RuntimeError("No seg_head.weight found in checkpoint. ")

    num_classes, in_dim = sd_head["weight"].shape
    print(f"[seg_head] detected in_dim={in_dim}, num_classes={num_classes}")

    seg_head = SegHead(in_dim=in_dim, num_classes=num_classes).to(device)
    seg_head.seg_head.weight.data.copy_(sd_head["weight"].to(device))
    if "bias" in sd_head:
        seg_head.seg_head.bias.data.copy_(sd_head["bias"].to(device))
    return seg_head


# ----------------------------
# Data loader for SemanticKITTI .bin
# ----------------------------
def load_kitti_bin(path: str):
    # SemanticKITTI velodyne: float32 [x, y, z, intensity]
    pts = np.fromfile(path, dtype=np.float32).reshape(-1, 4)
    coord = pts[:, :3]
    intensity = pts[:, 3:4]

    inten = intensity.copy()
    if inten.size > 0:
        mn = float(inten.min())
        mx = float(inten.max())
        denom = max(mx - mn, 1e-6)
        inten = (inten - mn) / denom
    color = np.repeat(inten, 3, axis=1).astype(np.float32)

    normal = np.zeros_like(coord, dtype=np.float32)
    return {
        "coord": coord.astype(np.float32),
        "color": color,
        "normal": normal,
    }


def load_point_file(path: str):
    ext = os.path.splitext(path)[1].lower()
    if ext == ".bin":
        return load_kitti_bin(path)
    if ext == ".ply":
        pcd = o3d.io.read_point_cloud(path)
        coord = np.asarray(pcd.points, dtype=np.float32)
        color = np.asarray(pcd.colors, dtype=np.float32)
        normal = np.asarray(pcd.normals, dtype=np.float32)

        if color.size == 0:
            color = np.zeros((coord.shape[0], 3), dtype=np.float32)
        if normal.size == 0:
            normal = np.zeros_like(coord, dtype=np.float32)

        if coord.size > 0:
            finite_mask = np.isfinite(coord).all(axis=1)
            coord = coord[finite_mask]
            color = color[finite_mask]
            normal = normal[finite_mask]

        return {
            "coord": coord,
            "color": color,
            "normal": normal,
        }
    raise ValueError(f"Unsupported input extension: {ext}")


def upcast_feat_like_demo(point):
    # As in the repo demo: walk back up the pooling hierarchy, concatenating
    # child features onto each parent to recover full-resolution features.
    while "pooling_parent" in point:
        parent = point.pop("pooling_parent")
        inverse = point.pop("pooling_inverse")
        parent.feat = torch.cat([parent.feat, point.feat[inverse]], dim=-1)
        point = parent
    return point


def visualize_and_save(coord, pred, outdir, show: bool, prefix: str = "pred"):
    os.makedirs(outdir, exist_ok=True)

    colors = CLASS_COLOR[pred]  # (N,3) in 0..1

    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(coord)
    pcd.colors = o3d.utility.Vector3dVector(colors)

    ply_path = os.path.join(outdir, f"{prefix}.ply")
    npy_path = os.path.join(outdir, f"{prefix}.npy")
    o3d.io.write_point_cloud(ply_path, pcd)
    np.save(npy_path, pred)

    print(f"[saved] {ply_path}")
    print(f"[saved] {npy_path}")
    if show:
        o3d.visualization.draw_geometries([pcd])


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--ckpt", type=str, required=True, help="Path to model_best.pth (contains seg_head.*)")
    parser.add_argument("--input", type=str, required=True, help="SemanticKITTI .bin path")
    parser.add_argument("--outdir", type=str, required=True)
    parser.add_argument("--grid_size", type=float, default=0.05)
    parser.add_argument("--show", action="store_true")
    parser.add_argument("--wo_color", action="store_true")
    parser.add_argument("--wo_normal", action="store_true")

    args = parser.parse_args()

    concerto.utils.set_seed(46647087)


    if flash_attn is not None:
        model = concerto.load("concerto_large_outdoor", repo_id="Pointcept/Concerto").to(device)
    else:
        custom_config = dict(enc_patch_size=[1024 for _ in range(5)], enable_flash=False)
        model = concerto.load("concerto_large_outdoor", repo_id="Pointcept/Concerto", custom_config=custom_config).to(device)

    print(f"Model params: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

    # 2) load seg head weights from your ckpt + load backbone weights into this model
    seg_head = load_backbone_and_head(model, args.ckpt, device)

    model.eval()
    seg_head.eval()

    # 3) load data
    point = load_point_file(args.input)
    if args.wo_color:
        point["color"] = np.zeros_like(point["coord"], dtype=np.float32)
    if args.wo_normal:
        point["normal"] = np.zeros_like(point["coord"], dtype=np.float32)

    original_coord = point["coord"].copy()

    # 4) transform
    for t in TRANSFORM_CONFIG:
        if t.get("type") == "GridSample":
            t["grid_size"] = float(args.grid_size)

    transform = Compose(TRANSFORM_CONFIG)
    point = transform(point)

    # 5) inference
    with torch.inference_mode():
        for k in list(point.keys()):
            if isinstance(point[k], torch.Tensor) and device == "cuda":
                point[k] = point[k].cuda(non_blocking=True)

        point = model(point)
        point = upcast_feat_like_demo(point)

        logits = seg_head(point.feat)            # (N_ds, 19)
        pred_ds = logits.argmax(dim=-1)          # (N_ds,)
        pred = pred_ds[point.inverse].cpu().numpy().astype(np.int32)  # (N_orig,)
        coord_ds = point.coord.cpu().numpy()

    print(f"Segmentation done. N={pred.shape[0]}")
    uniq = np.unique(pred)
    print("Predicted classes:", uniq.tolist())
    # optional: print labels
    for c in uniq[:10]:
        if 0 <= int(c) < len(KITTI_CLASS_LABELS):
            print(f"  {int(c)} -> {KITTI_CLASS_LABELS[int(c)]}")

    # 6) visualize/save
    visualize_and_save(coord_ds, pred_ds.cpu().numpy().astype(np.int32), args.outdir, args.show, prefix="pred_ds")
    visualize_and_save(original_coord, pred, args.outdir, args.show, prefix="pred")


if __name__ == "__main__":
    main()
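
I invoke it as (paths are placeholders):

python demo/kitti_infer_vis.py --ckpt /path/to/model_best.pth --input /path/to/frame.bin --outdir ./pred_out --grid_size 0.05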

As above, I set apply_z to True/False depending on the sample. Here is a screenshot for SemanticKITTI:

[Screenshot: SemanticKITTI predictions]

At least the road and car classes are segmented okay.

And here is my sample:

[Screenshots: predictions on my sample]

Questions:

  1. Are there recommended preprocessing steps for real LiDAR / large-map .ply tiles (e.g., coordinate normalization, clipping, height alignment, intensity/color handling)?
  2. Could my transform order/settings be incorrect for inference (especially RandomScale / GridSample / CenterShift(apply_z=...))?
  3. Would you recommend running inference through the repo’s “test_cfg voxelize” pipeline (mode=test) rather than GridSample(mode=train)? I sketch what I mean just below this list.
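
For question 3, this is the kind of aggregation I have in mind, as a rough sketch only: it assumes GridSample(mode="test") returns a list of part dicts (as my analysis script observes) and that each part's "inverse" maps every original point to one of that part's voxels; the actual key names and semantics in the repo may differ.

# Sketch only: vote per-part logits back into the full-resolution cloud.
import torch

@torch.inference_mode()
def infer_test_mode(model, seg_head, parts, n_orig, num_classes=19, device="cuda"):
    votes = torch.zeros(n_orig, num_classes, device=device)
    for part in parts:
        for k, v in part.items():
            if isinstance(v, torch.Tensor):
                part[k] = v.to(device, non_blocking=True)
        inv = part["inverse"]                     # (N_orig,) -> part voxel ids (assumption)
        out = upcast_feat_like_demo(model(part))  # reuse helper from the script above
        votes += seg_head(out.feat)[inv]          # accumulate logits per original point
    return votes.argmax(dim=-1).cpu().numpy()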

If you need any additional info, I can provide it (e.g., a small .ply tile, or raw coordinate/color statistics).
