Description
Hello, thank you for your work.
I observe reasonable semantic segmentation results on SemanticKITTI validation samples, but very poor results on real LiDAR data, even after trying to match scale, density, and voxelization settings.
Summary:
- Model trained on SemanticKITTI achieves good validation performance (mIoU ≈ 0.65).
- Inference on SemanticKITTI validation frames produces reasonable predictions (road/cars etc).
- Inference on my real LiDAR tiles (cropped from a large .ply) is very poor.
I attach JSON files with all the info about the samples, plus screenshots.
First, I analyze the samples with the following script:
# analyze_before_infer.py
import os
import json
import argparse
import numpy as np
import torch
import open3d as o3d
import concerto
from concerto.transform import Compose
BASE_TRANSFORM_CONFIG = [
dict(type="RandomScale", scale=[0.2, 0.2]),
dict(
type="GridSample",
grid_size=0.01,
hash_type="fnv",
mode="train",
return_grid_coord=True,
return_inverse=True,
),
# dict(type="CenterShift", apply_z=False),
dict(type="CenterShift", apply_z=True),
dict(type="NormalizeColor"),
dict(type="ToTensor"),
dict(
type="Collect",
keys=("coord", "grid_coord", "color", "inverse"),
feat_keys=("coord", "color", "normal"),
),
]
def load_kitti_bin(path: str):
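    # SemanticKITTI velodyne scan: float32 [x, y, z, intensity] per point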
pts = np.fromfile(path, dtype=np.float32).reshape(-1, 4)
coord = pts[:, :3]
intensity = pts[:, 3:4]
inten = intensity.copy()
if inten.size > 0:
mn = float(inten.min())
mx = float(inten.max())
denom = max(mx - mn, 1e-6)
inten = (inten - mn) / denom
color = np.repeat(inten, 3, axis=1).astype(np.float32)
normal = np.zeros_like(coord, dtype=np.float32)
return {"coord": coord.astype(np.float32), "color": color, "normal": normal}
def load_ply(path: str):
pcd = o3d.io.read_point_cloud(path)
coord = np.asarray(pcd.points, dtype=np.float32)
color = np.asarray(pcd.colors, dtype=np.float32)
normal = np.asarray(pcd.normals, dtype=np.float32)
if color.size == 0:
color = np.zeros((coord.shape[0], 3), dtype=np.float32)
if normal.size == 0:
normal = np.zeros_like(coord, dtype=np.float32)
if coord.size > 0:
m = np.isfinite(coord).all(axis=1)
coord, color, normal = coord[m], color[m], normal[m]
return {"coord": coord, "color": color, "normal": normal}
def load_point_file(path: str):
ext = os.path.splitext(path)[1].lower()
if ext == ".bin":
return load_kitti_bin(path)
if ext == ".ply":
return load_ply(path)
raise ValueError(f"Unsupported input extension: {ext}")
def save_ply(path: str, coord, color=None):
coord = to_numpy(coord)
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(coord)
if color is not None:
c = to_numpy(color)
if c.dtype != np.float32:
c = c.astype(np.float32)
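        # heuristic: values above 1.5 are assumed to be 0..255 colors and rescaled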
if c.size > 0 and c.max() > 1.5:
c = c / 255.0
c = np.clip(c, 0.0, 1.0)
pcd.colors = o3d.utility.Vector3dVector(c)
o3d.io.write_point_cloud(path, pcd)
def approx_nn_stats(coord: np.ndarray, sample_n: int = 20000, seed: int = 0):
if coord.shape[0] == 0:
return None
n = coord.shape[0]
rng = np.random.default_rng(seed)
m = min(sample_n, n)
idx = rng.choice(n, size=m, replace=False)
sub = coord[idx]
# brute-ish but ok for 20k with open3d KDTree
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(coord)
kdt = o3d.geometry.KDTreeFlann(pcd)
dists = []
for p in sub:
_, ii, dd = kdt.search_knn_vector_3d(p, 2) # nearest incl self
if len(dd) >= 2:
dists.append(np.sqrt(dd[1]))
if not dists:
return None
d = np.array(dists, dtype=np.float32)
return {
"nn_min": float(np.min(d)),
"nn_med": float(np.median(d)),
"nn_p95": float(np.percentile(d, 95)),
"nn_p99": float(np.percentile(d, 99)),
"nn_max": float(np.max(d)),
"nn_mean": float(np.mean(d)),
"sample_n": int(len(d)),
}
def to_numpy(a):
"""Accept np.ndarray or torch.Tensor and return np.ndarray (cpu)."""
if isinstance(a, torch.Tensor):
return a.detach().cpu().numpy()
return a
def bounds_stats(coord):
coord = to_numpy(coord)
if coord is None or coord.shape[0] == 0:
return None
mn = coord.min(axis=0)
mx = coord.max(axis=0)
rg = mx - mn
ctr = (mx + mn) / 2.0
r = np.linalg.norm(coord, axis=1)
return {
"mins": mn.tolist(),
"maxs": mx.tolist(),
"range": rg.tolist(),
"center": ctr.tolist(),
"r_min": float(r.min()),
"r_med": float(np.median(r)),
"r_p95": float(np.percentile(r, 95)),
"r_max": float(r.max()),
}
def color_stats(color):
color = to_numpy(color)
if color is None or color.shape[0] == 0:
return None
c = color.astype(np.float32, copy=False)
return {
"min": c.min(axis=0).tolist(),
"max": c.max(axis=0).tolist(),
"mean": c.mean(axis=0).tolist(),
"std": c.std(axis=0).tolist(),
"frac_outside_0_1": float(np.mean((c < 0).any(axis=1) | (c > 1).any(axis=1))),
"frac_zero": float(np.mean(np.all(c == 0, axis=1))),
}
def grid_occupancy_probe(coord, grid_size: float):
coord = to_numpy(coord)
if coord is None or coord.shape[0] == 0:
return None
g = float(grid_size)
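    # integer voxel index per point: floor(coord / grid_size), then shift so indices start at 0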
scaled = coord / g
grid = np.floor(scaled).astype(np.int64)
mn = grid.min(axis=0)
grid = grid - mn
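    # pack (x, y, z) into a single structured key so np.unique can count points per voxel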
    key = np.rec.fromarrays(grid.T, names="x,y,z", formats="i8,i8,i8")
_, count = np.unique(key, return_counts=True)
count = count.astype(np.int32)
return {
"grid_size": g,
"voxels": int(count.size),
"mean": float(count.mean()),
"med": float(np.median(count)),
"p95": float(np.percentile(count, 95)),
"p99": float(np.percentile(count, 99)),
"max": int(count.max()),
}
def validate_inverse(inverse, n_ds: int):
inv = to_numpy(inverse)
ok = True
info = {
"len": int(inv.shape[0]),
"min": int(inv.min()) if inv.size else None,
"max": int(inv.max()) if inv.size else None,
"n_ds": int(n_ds),
"frac_oob": None,
}
if inv.size:
oob = (inv < 0) | (inv >= n_ds) # numpy bool array
info["frac_oob"] = float(oob.mean())
ok = (info["frac_oob"] == 0.0)
return ok, info
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--input", required=True, type=str)
ap.add_argument("--outdir", default="./preflight_out", type=str)
ap.add_argument("--grid_size", default=0.06, type=float)
ap.add_argument("--mode", default="train", choices=["train", "test"])
ap.add_argument("--save_ply", action="store_true")
ap.add_argument("--seed", type=int, default=46647087)
ap.add_argument("--nn_sample", type=int, default=20000)
ap.add_argument("--no_color", action="store_true")
ap.add_argument("--no_normal", action="store_true")
args = ap.parse_args()
os.makedirs(args.outdir, exist_ok=True)
concerto.utils.set_seed(args.seed)
# ---- load
point = load_point_file(args.input)
if args.no_color:
point["color"] = np.zeros_like(point["coord"], dtype=np.float32)
if args.no_normal:
point["normal"] = np.zeros_like(point["coord"], dtype=np.float32)
report = {"input": args.input, "grid_size": float(args.grid_size), "mode": args.mode}
coord0 = point["coord"]
color0 = point["color"]
report["pre"] = {
"n": int(coord0.shape[0]),
"bounds": bounds_stats(coord0),
"nn": approx_nn_stats(coord0, sample_n=args.nn_sample, seed=args.seed),
"color": color_stats(color0),
"nan_inf_frac": float(np.mean(~np.isfinite(coord0).all(axis=1))) if coord0.size else 0.0,
}
report["pre"]["occupancy_probe"] = grid_occupancy_probe(coord0, args.grid_size)
if args.save_ply:
save_ply(os.path.join(args.outdir, "pre.ply"), coord0, color0)
# ---- build transform = SAME as infer, but override grid_size and mode
cfg = []
for t in BASE_TRANSFORM_CONFIG:
tt = dict(t)
if tt.get("type") == "GridSample":
tt["grid_size"] = float(args.grid_size)
tt["mode"] = args.mode
cfg.append(tt)
transform = Compose(cfg)
# ---- apply transform
out = transform(point)
# NOTE: if mode=test, GridSample returns a list of parts.
if isinstance(out, list):
report["post"] = {"mode_test_parts": len(out)}
# analyze first part as representative
out0 = out[0]
report["post"]["part0_n_ds"] = int(out0["coord"].shape[0])
report["post"]["part0_bounds"] = bounds_stats(out0["coord"])
if "grid_coord" in out0:
gc = to_numpy(out0["grid_coord"])
report["post"]["part0_grid_coord"] = {
"min": gc.min(axis=0).tolist(),
"max": gc.max(axis=0).tolist(),
"shape": list(gc.shape),
}
if "inverse" in out0:
ok, inv_info = validate_inverse(out0["inverse"], int(out0["coord"].shape[0]))
report["post"]["part0_inverse_ok"] = ok
report["post"]["part0_inverse"] = inv_info
if args.save_ply:
save_ply(os.path.join(args.outdir, "post_part0_ds.ply"),
out0["coord"], out0.get("color", None))
else:
coord_ds = out["coord"]
report["post"] = {
"n_ds": int(coord_ds.shape[0]),
"bounds": bounds_stats(coord_ds),
}
if "grid_coord" in out:
gc = to_numpy(out["grid_coord"])
report["post"]["grid_coord"] = {
"min": gc.min(axis=0).tolist(),
"max": gc.max(axis=0).tolist(),
"shape": list(gc.shape),
}
# estimated physical extent implied by grid coord
g = float(args.grid_size)
ext = (gc.max(axis=0) - gc.min(axis=0) + 1).astype(np.float32) * g
report["post"]["grid_extent_m"] = ext.tolist()
if "inverse" in out:
inv = out["inverse"]
ok, inv_info = validate_inverse(inv, int(coord_ds.shape[0]))
report["post"]["inverse_ok"] = ok
report["post"]["inverse"] = inv_info
# extra: save voxel centers PLY (helps see voxel size visually)
if args.save_ply and "grid_coord" in out:
gc = to_numpy(out["grid_coord"]).astype(np.float32)
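        # per the POST logs, grid_coord starts at 0, so these centers live in a
        # local grid frame rather than the original coordinates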
centers = (gc + 0.5) * float(args.grid_size)
save_ply(os.path.join(args.outdir, "voxel_centers.ply"), centers, None)
if args.save_ply:
save_ply(os.path.join(args.outdir, "post_ds.ply"),
coord_ds, out.get("color", None))
# ---- dump report
rep_path = os.path.join(args.outdir, "report.json")
with open(rep_path, "w") as f:
json.dump(report, f, indent=2)
print(f"[saved] {rep_path}")
# ---- print key highlights for quick scanning
pre = report["pre"]
print("\n=== PRE ===")
print(f"N={pre['n']}")
b = pre["bounds"]
if b:
print("range(dx,dy,dz) =", np.round(np.array(b["range"]), 6).tolist())
print("center =", np.round(np.array(b["center"]), 6).tolist())
print("r_med/p95/max =", b["r_med"], b["r_p95"], b["r_max"])
if pre.get("occupancy_probe"):
o = pre["occupancy_probe"]
print(f"occupancy@grid={o['grid_size']}: voxels={o['voxels']} mean={o['mean']:.2f} p99={o['p99']:.1f} max={o['max']}")
if pre.get("nn"):
nn = pre["nn"]
print(f"nn_med={nn['nn_med']:.6f} nn_p95={nn['nn_p95']:.6f} nn_mean={nn['nn_mean']:.6f}")
print("\n=== POST ===")
post = report["post"]
if "n_ds" in post:
print(f"N_ds={post['n_ds']}")
if post.get("grid_coord"):
gg = post["grid_coord"]
print("grid_coord min/max =", gg["min"], gg["max"])
print("grid_extent_m =", np.round(np.array(post.get("grid_extent_m", [])), 6).tolist())
if "inverse_ok" in post:
print("inverse_ok =", post["inverse_ok"], "frac_oob =", post["inverse"].get("frac_oob"))
else:
print(post)
if __name__ == "__main__":
main()
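For reference, I run it the same way for both inputs, e.g. python analyze_before_infer.py --input <tile.ply or scan.bin> --outdir ./preflight_out --grid_size 0.01 --save_ply, keeping --grid_size equal to the grid size used at inference.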
I set apply_z=True for my sample and apply_z=False when using the KITTI sample.
report.json - JSON info about my sample
report.json - JSON info about the KITTI validation sample
Logs for my sample:
=== PRE ===
N=45388
range(dx,dy,dz) = [14.0, 3.294998, 8.0]
center = [-13290.058594, -172.334503, -11003.033203]
r_med/p95/max = 17255.833984375 17260.4140625 17262.54296875
occupancy@grid=0.01: voxels=45388 mean=1.00 p99=1.0 max=1
nn_med=0.052494 nn_p95=0.062577 nn_mean=0.054039
=== POST ===
N_ds=44223
grid_coord min/max = [0, 0, 0] [280, 66, 160]
grid_extent_m = [2.81, 0.67, 1.61]
inverse_ok = True frac_oob = 0.0
Logs for the KITTI sample:
=== PRE ===
N=117482
range(dx,dy,dz) = [158.607651, 60.406651, 6.473855]
center = [0.285553, -14.823313, -0.478673]
r_med/p95/max = 8.882246017456055 24.29372787475586 79.70906829833984
occupancy@grid=0.01: voxels=117021 mean=1.00 p99=1.0 max=2
nn_med=0.030784 nn_p95=0.127395 nn_mean=0.045949
=== POST ===
N_ds=84262
grid_coord min/max = [0, 0, 0] [3172, 1208, 130]
grid_extent_m = [31.73, 12.09, 1.31]
inverse_ok = True frac_oob = 0.0
Note: RandomScale(scale=[0.2, 0.2]) is enabled, so the post-transform bounds are in scaled coordinates (which is why grid_extent_m is roughly 0.2x the PRE range). Note also the large global offset of my tile (center ≈ [-13290, -172, -11003]) versus the sensor-centered KITTI frame.
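A quick numeric cross-check of the logs (plain Python; the numbers are copied from my sample's PRE/POST blocks above):
# pre-scale extent * RandomScale / grid_size should reproduce grid_coord max
pre_range = [14.0, 3.294998, 8.0]   # PRE: range(dx, dy, dz)
scale, grid_size = 0.2, 0.01        # RandomScale and GridSample settings
print([round(r * scale / grid_size) for r in pre_range])  # -> [280, 66, 160]
This matches the reported grid_coord max [280, 66, 160], so the scale/voxel arithmetic itself behaves as expected.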
For additional context: I trained the model on the SemanticKITTI dataset and achieved good validation results:
[2025-12-22 20:12:49,875 INFO test.py line 340 42338] Val result: mIoU/mAcc/allAcc 0.6545/0.7329/0.9065
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_0 - car Result: iou/accuracy 0.9592/0.9850
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_1 - bicycle Result: iou/accuracy 0.4861/0.5640
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_2 - motorcycle Result: iou/accuracy 0.6834/0.7572
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_3 - truck Result: iou/accuracy 0.8639/0.9583
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_4 - other-vehicle Result: iou/accuracy 0.6338/0.6886
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_5 - person Result: iou/accuracy 0.7544/0.8516
[2025-12-22 20:12:49,875 INFO test.py line 346 42338] Class_6 - bicyclist Result: iou/accuracy 0.9041/0.9430
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_7 - motorcyclist Result: iou/accuracy 0.0000/0.0000
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_8 - road Result: iou/accuracy 0.9131/0.9593
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_9 - parking Result: iou/accuracy 0.4716/0.5396
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_10 - sidewalk Result: iou/accuracy 0.7510/0.8935
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_11 - other-ground Result: iou/accuracy 0.1161/0.1714
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_12 - building Result: iou/accuracy 0.8886/0.9631
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_13 - fence Result: iou/accuracy 0.5907/0.7776
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_14 - vegetation Result: iou/accuracy 0.8728/0.9267
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_15 - trunk Result: iou/accuracy 0.7112/0.7974
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_16 - terrain Result: iou/accuracy 0.7181/0.8017
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_17 - pole Result: iou/accuracy 0.6307/0.7824
[2025-12-22 20:12:49,876 INFO test.py line 346 42338] Class_18 - traffic-sign Result: iou/accuracy 0.4859/0.5654
[2025-12-22 20:12:49,876 INFO test.py line 354 42338] <<<<<<<<<<<<<<<<<< End Evaluation <<<<<<<<<<<<<<<<<<
Here is the inference code:
# demo/kitti_infer_vis.py
import os
import argparse
import numpy as np
import torch
import torch.nn as nn
import open3d as o3d
import concerto
from concerto.transform import Compose
try:
import flash_attn # noqa
except Exception:
flash_attn = None
device = "cuda" if torch.cuda.is_available() else "cpu"
# ----------------------------
# SemanticKITTI 19-class meta
# ----------------------------
KITTI_VALID_CLASS_IDS = tuple(range(19))
KITTI_CLASS_LABELS = (
"car", "bicycle", "motorcycle", "truck", "other-vehicle",
"person", "bicyclist", "motorcyclist",
"road", "parking", "sidewalk", "other-ground",
"building", "fence", "vegetation", "trunk",
"terrain", "pole", "traffic-sign",
)
KITTI_COLOR_MAP = {
0: (255.0, 0.0, 0.0),
1: (0.0, 255.0, 0.0),
2: (0.0, 0.0, 255.0),
3: (255.0, 255.0, 0.0),
4: (255.0, 0.0, 255.0),
5: (0.0, 255.0, 255.0),
6: (255.0, 128.0, 0.0),
7: (128.0, 0.0, 255.0),
8: (128.0, 128.0, 128.0),
9: (255.0, 192.0, 203.0),
10: (0.0, 128.0, 128.0),
11: (255.0, 215.0, 0.0),
12: (70.0, 130.0, 180.0),
13: (165.0, 42.0, 42.0),
14: (50.0, 205.0, 50.0),
15: (255.0, 99.0, 71.0),
16: (0.0, 100.0, 0.0),
17: (211.0, 211.0, 211.0),
18: (255.0, 255.0, 255.0),
}
CLASS_COLOR = np.array([KITTI_COLOR_MAP[i] for i in KITTI_VALID_CLASS_IDS], dtype=np.float32) / 255.0
# ----------------------------
# ----------------------------
# TRANSFORM_CONFIG = [
# dict(type="RandomScale", scale=[1, 1]),
# dict(
# type="GridSample",
# grid_size=0.06,
# hash_type="fnv",
# mode="train",
# return_grid_coord=True,
# return_inverse=True,
# ),
# dict(type="CenterShift", apply_z=False),
# dict(type="NormalizeColor"),
# dict(type="ToTensor"),
# dict(
# type="Collect",
# keys=("coord", "grid_coord", "color", "inverse"),
# feat_keys=("coord", "color", "normal"),
# ),
# ]
TRANSFORM_CONFIG = [
dict(type="RandomScale", scale=[0.2, 0.2]),
dict(
type="GridSample",
grid_size=0.01,
hash_type="fnv",
mode="train",
return_grid_coord=True,
return_inverse=True,
),
# dict(type="CenterShift", apply_z=False),
dict(type="CenterShift", apply_z=True),
dict(type="NormalizeColor"),
dict(type="ToTensor"),
dict(
type="Collect",
keys=("coord", "grid_coord", "color", "inverse"),
feat_keys=("coord", "color", "normal"),
),
]
# ----------------------------
# SegHead
# ----------------------------
class SegHead(nn.Module):
def __init__(self, in_dim: int, num_classes: int):
super().__init__()
self.seg_head = nn.Linear(in_dim, num_classes)
def forward(self, x):
return self.seg_head(x)
# ----------------------------
# Checkpoint utils
# ----------------------------
def extract_state_dict(ckpt: dict) -> dict:
for k in ["state_dict", "model", "net", "module"]:
if k in ckpt and isinstance(ckpt[k], dict):
return ckpt[k]
return ckpt
def remap_keys(sd: dict) -> dict:
out = {}
for k, v in sd.items():
kk = k
if kk.startswith("module."):
kk = kk[len("module."):]
if kk.startswith("backbone."):
kk = kk[len("backbone."):]
if kk.startswith("e."):
kk = kk[len("e."):]
if kk.startswith("d."):
kk = kk[len("d."):]
out[kk] = v
return out
def split_backbone_and_head(sd: dict):
head = {}
backbone = {}
for k, v in sd.items():
if k.startswith("seg_head."):
head[k[len("seg_head."):]] = v # weight/bias
else:
backbone[k] = v
return backbone, head
def load_backbone_and_head(model, ckpt_path: str, device: str):
ckpt = torch.load(ckpt_path, map_location="cpu")
sd = remap_keys(extract_state_dict(ckpt))
sd_backbone, sd_head = split_backbone_and_head(sd)
incompatible = model.load_state_dict(sd_backbone, strict=False)
missing = list(incompatible.missing_keys)
unexpected = list(incompatible.unexpected_keys)
print(f"[backbone] missing={len(missing)} unexpected={len(unexpected)}")
print("missing[:10] =", missing[:10])
print("unexpected[:10] =", unexpected[:10])
if "weight" not in sd_head:
raise RuntimeError("No seg_head.weight found in checkpoint. ")
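    # nn.Linear stores weight as (out_features, in_features) = (num_classes, in_dim)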
num_classes, in_dim = sd_head["weight"].shape
print(f"[seg_head] detected in_dim={in_dim}, num_classes={num_classes}")
seg_head = SegHead(in_dim=in_dim, num_classes=num_classes).to(device)
seg_head.seg_head.weight.data.copy_(sd_head["weight"].to(device))
if "bias" in sd_head:
seg_head.seg_head.bias.data.copy_(sd_head["bias"].to(device))
return seg_head
# ----------------------------
# Data loader for SemanticKITTI .bin
# ----------------------------
def load_kitti_bin(path: str):
# SemanticKITTI velodyne: float32 [x, y, z, intensity]
pts = np.fromfile(path, dtype=np.float32).reshape(-1, 4)
coord = pts[:, :3]
intensity = pts[:, 3:4]
inten = intensity.copy()
if inten.size > 0:
mn = float(inten.min())
mx = float(inten.max())
denom = max(mx - mn, 1e-6)
inten = (inten - mn) / denom
color = np.repeat(inten, 3, axis=1).astype(np.float32)
normal = np.zeros_like(coord, dtype=np.float32)
return {
"coord": coord.astype(np.float32),
"color": color,
"normal": normal,
}
def load_point_file(path: str):
ext = os.path.splitext(path)[1].lower()
if ext == ".bin":
return load_kitti_bin(path)
if ext == ".ply":
pcd = o3d.io.read_point_cloud(path)
coord = np.asarray(pcd.points, dtype=np.float32)
color = np.asarray(pcd.colors, dtype=np.float32)
normal = np.asarray(pcd.normals, dtype=np.float32)
if color.size == 0:
color = np.zeros((coord.shape[0], 3), dtype=np.float32)
if normal.size == 0:
normal = np.zeros_like(coord, dtype=np.float32)
if coord.size > 0:
finite_mask = np.isfinite(coord).all(axis=1)
coord = coord[finite_mask]
color = color[finite_mask]
normal = normal[finite_mask]
return {
"coord": coord,
"color": color,
"normal": normal,
}
raise ValueError(f"Unsupported input extension: {ext}")
def upcast_feat_like_demo(point):
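    # walk up the pooling hierarchy, concatenating child features back onto
    # each parent, so the seg head sees full-resolution features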
while "pooling_parent" in point:
parent = point.pop("pooling_parent")
inverse = point.pop("pooling_inverse")
parent.feat = torch.cat([parent.feat, point.feat[inverse]], dim=-1)
point = parent
return point
def visualize_and_save(coord, pred, outdir, show: bool, prefix: str = "pred"):
os.makedirs(outdir, exist_ok=True)
colors = CLASS_COLOR[pred] # (N,3) in 0..1
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(coord)
pcd.colors = o3d.utility.Vector3dVector(colors)
ply_path = os.path.join(outdir, f"{prefix}.ply")
npy_path = os.path.join(outdir, f"{prefix}.npy")
o3d.io.write_point_cloud(ply_path, pcd)
np.save(npy_path, pred)
print(f"[saved] {ply_path}")
print(f"[saved] {npy_path}")
if show:
o3d.visualization.draw_geometries([pcd])
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--ckpt", type=str, required=True, help="Path to model_best.pth (contains seg_head.*)")
parser.add_argument("--input", type=str, required=True, help="SemanticKITTI .bin path")
parser.add_argument("--outdir", type=str, required=True)
parser.add_argument("--grid_size", type=float, default=0.05)
parser.add_argument("--show", action="store_true")
parser.add_argument("--wo_color", action="store_true")
parser.add_argument("--wo_normal", action="store_true")
args = parser.parse_args()
concerto.utils.set_seed(46647087)
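    # 1) build the backbone (flash-attn path if available)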
if flash_attn is not None:
model = concerto.load("concerto_large_outdoor", repo_id="Pointcept/Concerto").to(device)
else:
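        # flash_attn not installed: disable flash attention and use larger encoder patch sizes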
custom_config = dict(enc_patch_size=[1024 for _ in range(5)], enable_flash=False)
model = concerto.load("concerto_large_outdoor", repo_id="Pointcept/Concerto", custom_config=custom_config).to(device)
print(f"Model params: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")
# 2) load seg head weights from your ckpt + load backbone weights into this model
seg_head = load_backbone_and_head(model, args.ckpt, device)
model.eval()
seg_head.eval()
# 3) load data
point = load_point_file(args.input)
if args.wo_color:
point["color"] = np.zeros_like(point["coord"], dtype=np.float32)
if args.wo_normal:
point["normal"] = np.zeros_like(point["coord"], dtype=np.float32)
original_coord = point["coord"].copy()
# 4) transform
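    # note: this mutates the shared TRANSFORM_CONFIG in place (the analyze script copies each dict instead)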
for t in TRANSFORM_CONFIG:
if t.get("type") == "GridSample":
t["grid_size"] = float(args.grid_size)
transform = Compose(TRANSFORM_CONFIG)
point = transform(point)
# 5) inference
with torch.inference_mode():
for k in list(point.keys()):
if isinstance(point[k], torch.Tensor) and device == "cuda":
point[k] = point[k].cuda(non_blocking=True)
point = model(point)
point = upcast_feat_like_demo(point)
logits = seg_head(point.feat) # (N_ds, 19)
pred_ds = logits.argmax(dim=-1) # (N_ds,)
pred = pred_ds[point.inverse].cpu().numpy().astype(np.int32) # (N_orig,)
coord_ds = point.coord.cpu().numpy()
print(f"Segmentation done. N={pred.shape[0]}")
uniq = np.unique(pred)
print("Predicted classes:", uniq.tolist())
# optional: print labels
for c in uniq[:10]:
if 0 <= int(c) < len(KITTI_CLASS_LABELS):
print(f" {int(c)} -> {KITTI_CLASS_LABELS[int(c)]}")
# 6) visualize/save
visualize_and_save(coord_ds, pred_ds.cpu().numpy().astype(np.int32), args.outdir, args.show, prefix="pred_ds")
visualize_and_save(original_coord, pred, args.outdir, args.show, prefix="pred")
if __name__ == "__main__":
main()
Here too I set apply_z to True or False depending on the sample. Screenshots below.
For SemanticKITTI:
At least the road and car classes are segmented reasonably well.
And here is my sample:
Questions:
- Are there recommended preprocessing steps for real LiDAR / large-map .ply tiles (e.g., coordinate normalization, clipping, height alignment, intensity/color handling)?
- Could my transform order/settings be incorrect for inference (especially RandomScale / GridSample / CenterShift(apply_z=...))?
- Would you recommend running inference through the repo's “test_cfg voxelize” pipeline (GridSample mode=test) rather than GridSample(mode=train)? (Rough sketch of what I mean below.)
If you need any additional info, I can provide it (e.g., a small .ply tile, or raw coordinate/color statistics).
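To make the last question concrete, here is roughly the mode=test aggregation loop I have in mind, reusing device and upcast_feat_like_demo from kitti_infer_vis.py above. This is only a sketch under my assumptions (each part returned by GridSample(mode="test") keeps an inverse mapping every original point to its voxel representative within that part, as in the single-part branch); please correct me if the repo's test pipeline works differently:
import torch

def infer_test_mode(parts, model, seg_head, n_orig, num_classes=19):
    # average logits over the parts produced by GridSample(mode="test"),
    # then take one argmax for the full-resolution cloud
    logits_sum = torch.zeros(n_orig, num_classes)
    with torch.inference_mode():
        for part in parts:
            for k in list(part.keys()):
                if isinstance(part[k], torch.Tensor) and device == "cuda":
                    part[k] = part[k].cuda(non_blocking=True)
            out = upcast_feat_like_demo(model(part))
            logits = seg_head(out.feat)              # (N_part, num_classes)
            # assumed: out.inverse maps original index -> index in this part
            logits_sum += logits[out.inverse].cpu()  # (N_orig, num_classes)
    return (logits_sum / len(parts)).argmax(dim=-1).numpy()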