Delete convert_to_nuscenes.py
This commit is contained in:
@@ -1,730 +0,0 @@
|
||||
"""
|
||||
Convert private dataset to NuScenes-style .pkl and .coco.json files
|
||||
compatible with MMDetection3D.
|
||||
|
||||
Coordinate system note
|
||||
----------------------
|
||||
Your dataset: X-right, Y-forward, Z-up (same as NuScenes LiDAR frame)
|
||||
-> No rotation is needed for the point cloud.
|
||||
-> lidar == ego == global (identity transforms, no vehicle odometry available)
|
||||
|
||||
Projection matrix note
|
||||
----------------------
|
||||
Your camera_raw stores a 3×4 matrix P = K @ [R | t]
|
||||
where K is the 3×3 intrinsic and [R|t] maps points from the LiDAR frame
|
||||
into the camera frame.
|
||||
We decompose P via RQ-decomposition to recover K (intrinsic) and [R|t]
|
||||
(lidar-to-camera extrinsic).
|
||||
|
||||
Usage
|
||||
-----
|
||||
python convert_to_nuscenes.py \
|
||||
--root /path/to/your/dataset \
|
||||
--out /path/to/output/dir \
|
||||
--split 0.8 # fraction used for training (rest = val)
|
||||
--tag mydata # prefix for output file names
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import uuid
|
||||
import pickle
|
||||
import argparse
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Inline minimal copy of the dataset loader so the script is self-contained.
|
||||
# If you already have my_dataset.py on PYTHONPATH you can remove this block
|
||||
# and simply do: from my_dataset import MyDataset, SampleData, FrameData
|
||||
# ---------------------------------------------------------------------------
|
||||
import glob
|
||||
import open3d as o3d
|
||||
from dataclasses import dataclass, field
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
@dataclass
class FrameData:
    """One synchronized frame: a LiDAR sweep plus per-camera data and labels."""
    frame_index: int              # position of this frame within its sample
    lidar_points: np.ndarray      # (N,3) float32
    image_paths: Dict[str, str]   # cam_name -> path of the image file on disk
    camera_raw: Dict[str, dict]   # cam_name -> {projection_matrix, image_size}
    labels: List[dict]            # normalized 3D box labels (may be empty)
    contain_labels: bool = False  # True when `labels` is non-empty
|
||||
|
||||
|
||||
@dataclass
class SampleData:
    """A recorded sequence: all frames of one sample folder, keyed by index."""
    sample_id: str                # digits extracted from the sample folder name
    frames: Dict[int, FrameData]  # frame_index -> FrameData
|
||||
|
||||
|
||||
class MyDataset:
    """Eager loader for the private dataset layout.

    Each ``sample*`` folder under the root is expected to contain:
      - ``pcd_sequence<ID>/``: one ``.pcd`` per frame,
      - ``0/``, ``1/``, ...: one image folder per frame,
      - ``test<ID>.json``: page JSON with camera calibration,
      - ``test<ID>-mark.json``: mark JSON with 3D box annotations.
    """

    # Camera name -> on-disk image file name inside each frame folder.
    cam2filename = {
        "front_120": "scanofilm_surround_front_120_8M.jpg",
        "front_left_100": "scanofilm_surround_front_left_100_2M.jpg",
        "front_right_100": "scanofilm_surround_front_right_100_2M.jpg",
        "rear_100": "scanofilm_surround_rear_100_2M.jpg",
        "rear_left_100": "scanofilm_surround_rear_left_100_2M.jpg",
        "rear_right_100": "scanofilm_surround_rear_right_100_2M.jpg",
    }
    # Annotation hex color -> class name (as emitted by the labeling tool).
    color2class = {
        "#5414ED": "car", "#F6EE64": "pick-up-truck", "#F6A087": "small-truck",
        "#BC4EF1": "truck", "#4E9AF1": "bus", "#F1A94E": "special-vehicle",
        "#E1DFDD": "ignore", "#F91906": "tricyclist-withrider",
        "#FA5F51": "tricyclist-withoutrider", "#B8CB30": "bicycle-withrider",
        "#E6FD4E": "bicycle-withoutrider", "#876363": "people",
        "#2CBDF5": "crowd-people", "#C9F52C": "crowd-bicycle",
        "#DC6788": "crowd-car", "#6EC913": "traffic-cone",
        "#0DDE69": "plastic-barrier", "#8260D2": "crash-barrels",
        "#F1D1D1": "warning-triangle", "#FE6DF4": "crowd-traffic-cone",
        "#D1AA35": "crowd-plastic-barrier", "#3BE8D0": "crowd-crash-barrels",
        "#2B7567": "crowd-warning-triangle",
    }

    def __init__(self, root_folder: str):
        """Scan *root_folder* for sample folders and load them all eagerly."""
        self.root = root_folder
        # Reverse mapping: class name -> hex color.
        self.class2color = {v: k for k, v in self.color2class.items()}
        self._samples: Dict[str, SampleData] = {}
        self._load_all_samples()

    @staticmethod
    def _extract_id(folder_name: str) -> str:
        """Return the digits of the folder's basename (used as sample id)."""
        return ''.join(c for c in os.path.basename(folder_name) if c.isdigit())

    def _find_sample_folders(self):
        """List directories directly under the root named 'sample*'."""
        return [f for f in glob.glob(os.path.join(self.root, "sample*"))
                if os.path.isdir(f)]

    def _find_paths_in_sample(self, sample_folder, sample_id):
        """Build the expected file/folder paths inside one sample folder."""
        pcd_folder = os.path.join(sample_folder, f"pcd_sequence{sample_id}")
        pcd_files = glob.glob(os.path.join(pcd_folder, "*.pcd")) if os.path.isdir(pcd_folder) else []
        # One image folder per LiDAR frame, named "0", "1", ...
        img_folders = [os.path.join(sample_folder, str(i)) for i in range(len(pcd_files))]
        return {
            "pcd_folder": pcd_folder,
            "img_folders": img_folders,
            "page_json": os.path.join(sample_folder, f"test{sample_id}.json"),
            "mark_json": os.path.join(sample_folder, f"test{sample_id}-mark.json"),
        }

    def _read_lidar_points(self, pcd_path):
        """Read one .pcd file; return (N,3) float32 points, or None on error."""
        try:
            pcd = o3d.io.read_point_cloud(pcd_path, remove_nan_points=True,
                                          remove_infinite_points=True, format="auto")
            return np.asarray(pcd.points, dtype=np.float32)
        except Exception as e:
            # Best-effort: report and skip the unreadable file.
            print(f"Error loading {pcd_path}: {e}")
            return None

    def _load_all_lidar_frames(self, pcd_folder):
        """Load every .pcd in the folder; return {frame_index: points}."""
        out = {}
        for f in glob.glob(os.path.join(pcd_folder, "*.pcd")):
            # Frame index = digits after the last 'n' of the file stem
            # (e.g. "...n12.pcd" -> 12).
            idx = int(os.path.basename(f).split(".")[0].split("n")[-1])
            pts = self._read_lidar_points(f)
            if pts is not None:
                out[idx] = pts
        return out

    def _load_image_paths(self, img_folders):
        """Collect existing camera images; return {frame_index: {cam: path}}."""
        out = {}
        for folder in img_folders:
            if not os.path.isdir(folder):
                continue
            idx = int(os.path.basename(folder))
            # Keep only cameras whose image file actually exists.
            paths = {cam: os.path.join(folder, fname)
                     for cam, fname in self.cam2filename.items()
                     if os.path.isfile(os.path.join(folder, fname))}
            if paths:
                out[idx] = paths
        return out

    def _load_camera_raw(self, page_json_path):
        """Parse per-frame camera calibration from the page JSON.

        Returns {frame_index: {cam_name: {"projection_matrix": (3,4) ndarray,
        "image_size": [W, H]}}}; an empty dict when the file or the expected
        keys are missing.
        """
        if not os.path.isfile(page_json_path):
            return {}
        with open(page_json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        try:
            extend_source = data['data']['files'][0]['extendSources']
        except (KeyError, IndexError):
            return {}
        out = {}
        for d in extend_source:
            pe = d.get('pageElement', {})
            if "sensor" not in pe:
                continue
            try:
                idx = int(d['fileName'].split('.')[0])
            except Exception:
                continue
            # Normalize the sensor name: strip vendor prefix and the
            # resolution suffix so it matches cam2filename keys.
            cam_name = pe['sensor'].replace("ofilm_surround_", "")
            for suf in ["_2M", "_8M"]:
                if cam_name.endswith(suf):
                    cam_name = cam_name[:-len(suf)]
                    break
            if cam_name not in self.cam2filename:
                continue
            matrix = pe.get('mtx', [])
            image_size = pe.get('imageSize', [])
            if not matrix or not image_size:
                continue
            matrix = np.array(matrix, dtype=np.float64)
            if matrix.shape != (3, 4):
                continue
            out.setdefault(idx, {})[cam_name] = {
                "projection_matrix": matrix,
                "image_size": image_size,
            }
        return out

    def _extract_text_label(self, json_path):
        """Return the annotation list with 100% reviewer agreement, else []."""
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)['data']['list'][0]
        # Agreement rate per result entry, e.g. "100%" -> 1.0.
        idx2rate = {k: float(v['accuracy'].replace('%', '')) / 100.0
                    for k, v in data['rate'].items()}
        max_idx = max(idx2rate, key=idx2rate.get)
        if idx2rate[max_idx] == 1.0:
            return data['result'][max_idx]
        return []

    def _process_text_label(self, json_path):
        """Return {frame_index: [label dict, ...]} parsed from the mark JSON."""
        label_list = self._extract_text_label(json_path)
        if not label_list:
            return {}
        out = {}
        for entry in label_list:
            out[entry['index']] = self._process_frame_labels(entry['value'])
        return out

    def _process_frame_labels(self, label_list):
        """Normalize the raw label dicts of a single frame."""
        labels = []
        for lb in label_list:
            nl = {
                'num': lb['num'],
                'class': lb['label']['class-name'],
                'is_moving': not bool(lb['label']['static']),
                'isolation': bool(lb['label']['isolation']),
                'color': self.class2color.get(lb['label']['class-name'], ''),
                # Eight box corners in LiDAR coordinates.
                'points': [[float(lb['newPoints'][i]['x']),
                            float(lb['newPoints'][i]['y']),
                            float(lb['newPoints'][i]['z'])] for i in range(8)],
                'center': [float(lb['x']), float(lb['y']), float(lb['z'])],
                'rotateZ': float(lb['rotateZ']),
                # NOTE(review): width/height/depth are mapped to dx/dy/dz —
                # confirm this matches the labeling tool's axis convention.
                'dx': float(lb['width']),
                'dy': float(lb['height']),
                'dz': float(lb['depth']),
            }
            labels.append(nl)
        return labels

    def _load_one_sample(self, sample_folder):
        """Load one sample folder into a SampleData, or None on mismatch."""
        sid = self._extract_id(sample_folder)
        paths = self._find_paths_in_sample(sample_folder, sid)
        lidar_dict = self._load_all_lidar_frames(paths["pcd_folder"])
        img_dict = self._load_image_paths(paths["img_folders"])
        cam_dict = self._load_camera_raw(paths["page_json"])
        label_dict = self._process_text_label(paths["mark_json"])
        # All four per-frame dicts must describe the same number of frames.
        if not (len(label_dict) == len(lidar_dict) == len(img_dict) == len(cam_dict)):
            print(f"Sample {sid}: frame count mismatch, skipping.")
            return None
        frames = {idx: FrameData(
            frame_index=idx,
            lidar_points=lidar_dict[idx],
            image_paths=img_dict[idx],
            camera_raw=cam_dict[idx],
            labels=label_dict[idx],
            contain_labels=len(label_dict[idx]) > 0,
        ) for idx in range(len(lidar_dict))}
        return SampleData(sample_id=sid, frames=frames) if frames else None

    def _load_all_samples(self):
        """Load every sample folder under the root, skipping failed ones."""
        for sf in tqdm(self._find_sample_folders(), desc="Loading samples"):
            s = self._load_one_sample(sf)
            if s is not None:
                self._samples[s.sample_id] = s
        print(f"Loaded {len(self._samples)} samples.")

    @property
    def sample_ids(self):
        """Ids of all successfully loaded samples."""
        return list(self._samples.keys())

    def get_sample(self, sid):
        """Return the SampleData for *sid*, or None if unknown."""
        return self._samples.get(sid)

    def __len__(self):
        return len(self._samples)
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Helper math utilities
|
||||
# ===========================================================================
|
||||
|
||||
def rq_decompose(P: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Decompose a 3x4 projection matrix P = K @ [R | t].

    Parameters
    ----------
    P : (3, 4) projection matrix.

    Returns
    -------
    K : (3, 3) upper-triangular intrinsic matrix with positive diagonal.
    R : (3, 3) orthogonal rotation (lidar -> camera).
    t : (3,) translation expressed in camera coordinates.
    """
    M = P[:, :3]  # 3x3 left block: K @ R

    # RQ decomposition via QR of the inverse:
    #   inv(M) = Q @ U   (QR, U upper triangular)
    #   =>  M = inv(U) @ Q.T
    # with inv(U) upper triangular (the intrinsic) and Q.T orthogonal.
    # BUGFIX: the previous version took the QR of inv(M).T and inverted
    # R.T, which produces a LOWER-triangular factor and does not satisfy
    # M = K @ R.
    Q, U = np.linalg.qr(np.linalg.inv(M))
    K = np.linalg.inv(U)  # upper triangular -> intrinsic
    R = Q.T               # rotation

    # Enforce a positive diagonal on K while preserving the product:
    # K @ R == (K @ D) @ (D @ R) since D = D^-1 (diagonal of +-1).
    # (Right-multiplying K keeps it upper triangular and flips the sign of
    # each diagonal entry as needed; the old D @ K ordering broke K @ R.)
    signs = np.sign(np.diag(K))
    signs[signs == 0] = 1.0
    D = np.diag(signs)
    K = K @ D
    R = D @ R

    # Recover t from the last column: P[:, 3] = K @ t.
    t = np.linalg.inv(K) @ P[:, 3]

    return K, R, t
|
||||
|
||||
|
||||
def rotation_matrix_to_quaternion(R: np.ndarray) -> List[float]:
    """Convert a 3x3 rotation matrix to a quaternion [w, x, y, z].

    Branch selection follows the standard numerically stable scheme:
    use the trace when it is positive, otherwise the largest diagonal
    element.
    """
    m00, m11, m22 = R[0, 0], R[1, 1], R[2, 2]
    trace = m00 + m11 + m22

    if trace > 0:
        s = 0.5 / np.sqrt(trace + 1.0)
        return [0.25 / s,
                (R[2, 1] - R[1, 2]) * s,
                (R[0, 2] - R[2, 0]) * s,
                (R[1, 0] - R[0, 1]) * s]

    if m00 > m11 and m00 > m22:
        s = 2.0 * np.sqrt(1.0 + m00 - m11 - m22)
        return [(R[2, 1] - R[1, 2]) / s,
                0.25 * s,
                (R[0, 1] + R[1, 0]) / s,
                (R[0, 2] + R[2, 0]) / s]

    if m11 > m22:
        s = 2.0 * np.sqrt(1.0 + m11 - m00 - m22)
        return [(R[0, 2] - R[2, 0]) / s,
                (R[0, 1] + R[1, 0]) / s,
                0.25 * s,
                (R[1, 2] + R[2, 1]) / s]

    s = 2.0 * np.sqrt(1.0 + m22 - m00 - m11)
    return [(R[1, 0] - R[0, 1]) / s,
            (R[0, 2] + R[2, 0]) / s,
            (R[1, 2] + R[2, 1]) / s,
            0.25 * s]
|
||||
|
||||
|
||||
def identity_quaternion() -> List[float]:
    """Quaternion [w, x, y, z] of the identity rotation."""
    w, xyz = 1.0, [0.0] * 3
    return [w, *xyz]
|
||||
|
||||
|
||||
def project_3d_to_2d(points_3d: np.ndarray, P: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Project Nx3 LiDAR points to the image plane using a 3x4 projection P.

    Returns
    -------
    uv : (M, 2) pixel coordinates of the points in front of the camera,
        i.e. the projection of ``points_3d[valid]``.
    valid : (N,) boolean mask over the INPUT points (True where depth > 0).

    Note: the return annotation previously claimed a single ndarray while
    the function has always returned the ``(uv, valid)`` pair; the
    signature and docstring now state the actual contract.
    """
    N = points_3d.shape[0]
    pts_h = np.hstack([points_3d, np.ones((N, 1))])  # Nx4 homogeneous
    proj = (P @ pts_h.T).T  # Nx3 rows of [u*z, v*z, z]
    # Keep only points strictly in front of the camera (positive depth).
    valid = proj[:, 2] > 0
    uv = proj[valid, :2] / proj[valid, 2:3]
    return uv, valid
|
||||
|
||||
|
||||
def count_points_in_box(points: np.ndarray, cx, cy, cz, dx, dy, dz, yaw) -> int:
    """Return how many of *points* fall inside the yaw-rotated 3D box."""
    # Move the points into the box frame: shift to the box center, then
    # undo the box's yaw so the box becomes axis-aligned.
    center = np.array([cx, cy, cz], dtype=np.float32)
    c, s = np.cos(-yaw), np.sin(-yaw)
    undo_yaw = np.array([[c, -s, 0],
                         [s, c, 0],
                         [0, 0, 1]], dtype=np.float32)
    local = (points - center) @ undo_yaw.T

    # A point is inside when each coordinate lies within half the extent.
    inside = ((np.abs(local[:, 0]) <= dx / 2) &
              (np.abs(local[:, 1]) <= dy / 2) &
              (np.abs(local[:, 2]) <= dz / 2))
    return int(np.count_nonzero(inside))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# NuScenes camera name mapping
# ---------------------------------------------------------------------------
# Private camera keys -> canonical NuScenes sensor channel names.
CAM_NAME_MAP = {
    "front_120": "CAM_FRONT",
    "front_left_100": "CAM_FRONT_LEFT",
    "front_right_100": "CAM_FRONT_RIGHT",
    "rear_100": "CAM_BACK",
    "rear_left_100": "CAM_BACK_LEFT",
    "rear_right_100": "CAM_BACK_RIGHT",
}

# ---------------------------------------------------------------------------
# Class mapping: private dataset labels -> NuScenes labels
# ---------------------------------------------------------------------------
# NOTE(review): "ignore" is remapped to "car" — confirm this is intended
# rather than dropping those boxes from training.
PRIVATE2NUSCENES = {
    "car": "car",
    "pick-up-truck": "car",
    "small-truck": "truck",
    "truck": "truck",
    "bus": "bus",
    "ignore": "car",
    "special-vehicle": "construction_vehicle",
    "tricyclist-withrider": "bicycle",
    "tricyclist-withoutrider": "bicycle",
    "bicycle-withrider": "bicycle",
    "bicycle-withoutrider": "bicycle",
    "people": "pedestrian",
    "crowd-people": "pedestrian",
    "crowd-bicycle": "bicycle",
    "crowd-car": "car",
    "traffic-cone": "traffic_cone",
    "plastic-barrier": "barrier",
    "crash-barrels": "barrier",
    "warning-triangle": "barrier",
    "crowd-traffic-cone": "barrier",
    "crowd-plastic-barrier": "barrier",
    "crowd-crash-barrels": "barrier",
    "crowd-warning-triangle": "barrier",
}

# Official NuScenes detection classes (in canonical order)
DETECTION_CLASSES = [
    "car", "truck", "bus", "construction_vehicle", "motorcycle",
    "bicycle", "pedestrian", "traffic_cone", "barrier", "trailer",
]
# Class name -> integer label id (index into DETECTION_CLASSES).
CLASS2ID = {c: i for i, c in enumerate(DETECTION_CLASSES)}
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Core conversion
|
||||
# ===========================================================================
|
||||
|
||||
def frame_to_info(frame: FrameData, sample_id: str, out_lidar_dir: str,
                  out_img_dir: str, rel_lidar_root: str, rel_img_root: str) -> dict:
    """
    Convert one FrameData into a NuScenes-style info dict.

    Side effects: writes the LiDAR sweep as a float32 .bin under
    *out_lidar_dir* and a resized copy of each camera image under
    *out_img_dir*/<CAM_NAME>/; the returned dict references them via the
    relative roots *rel_lidar_root* / *rel_img_root*.

    Since we have no vehicle poses, we treat lidar = ego = global (identity).
    """
    token = str(uuid.uuid4())

    # ---- Save point cloud as .bin (float32 x,y,z) -------------------------
    os.makedirs(out_lidar_dir, exist_ok=True)
    bin_name = f"{sample_id}_frame{frame.frame_index:04d}.bin"
    bin_path = os.path.join(out_lidar_dir, bin_name)
    frame.lidar_points.astype(np.float32).tofile(bin_path)
    lidar_path_rel = os.path.join(rel_lidar_root, bin_name)

    # ---- Identity ego/global poses ----------------------------------------
    zero3 = [0.0, 0.0, 0.0]
    id_quat = identity_quaternion()

    # ---- Camera info -------------------------------------------------------
    cams_info = {}
    # All output images are resized to this common resolution.
    target_W, target_H = 1920, 1280

    for cam_key, nus_cam in CAM_NAME_MAP.items():
        # FIX: require both calibration AND an image file for this camera.
        # Previously only camera_raw was checked, so a calibrated camera
        # with a missing image raised KeyError below.
        if cam_key not in frame.camera_raw or cam_key not in frame.image_paths:
            continue
        cam_data = frame.camera_raw[cam_key]
        P = cam_data["projection_matrix"]  # 3x4
        # assumes image_size is [width, height] — TODO confirm against the
        # page JSON producer.
        ori_W, ori_H = cam_data["image_size"]

        # Decompose P -> K (3x3 intrinsic), R_cam_from_lidar, t_cam_from_lidar
        K, R_c2l_inv, t_in_cam = rq_decompose(P)
        # R_c2l_inv is R such that x_cam = R @ x_lidar + t
        R_lidar2cam = R_c2l_inv  # camera_from_lidar rotation
        t_lidar2cam = t_in_cam   # translation in camera coords

        # sensor2lidar = inverse of lidar2camera
        R_cam2lidar = R_lidar2cam.T
        t_cam2lidar = -R_cam2lidar @ t_lidar2cam

        # Scale the intrinsics to match the resized output resolution.
        sx = target_W / ori_W
        sy = target_H / ori_H
        K_adjusted = K.copy()
        K_adjusted[0, 0] *= sx
        K_adjusted[1, 1] *= sy
        K_adjusted[0, 2] *= sx
        K_adjusted[1, 2] *= sy

        # Copy (resized) image to the output directory.
        src_img = frame.image_paths[cam_key]
        cam_img_dir = os.path.join(out_img_dir, nus_cam)
        os.makedirs(cam_img_dir, exist_ok=True)
        img_name = f"{sample_id}_frame{frame.frame_index:04d}.jpg"
        dst_img = os.path.join(cam_img_dir, img_name)

        # FIX: close the source image deterministically via a context
        # manager instead of leaking the open file handle.
        with Image.open(src_img) as img:
            img_resized = img.resize((target_W, target_H), Image.LANCZOS)
            img_resized.save(dst_img)
        img_path_rel = os.path.join(rel_img_root, nus_cam, img_name)

        cam_token = str(uuid.uuid4())
        cams_info[nus_cam] = {
            "data_path": img_path_rel,
            "type": nus_cam,
            "sample_data_token": cam_token,
            # camera -> ego (treat camera as fixed to lidar frame via R_cam2lidar)
            "sensor2ego_translation": t_cam2lidar.tolist(),
            "sensor2ego_rotation": rotation_matrix_to_quaternion(R_cam2lidar),
            # ego -> global (identity)
            "ego2global_translation": zero3,
            "ego2global_rotation": id_quat,
            # lidar -> camera extrinsic (for projection)
            "sensor2lidar_translation": t_cam2lidar.tolist(),
            "sensor2lidar_rotation": rotation_matrix_to_quaternion(R_cam2lidar),
            # intrinsic
            "cam_intrinsic": K_adjusted.tolist(),
            "width": target_W,
            "height": target_H,
            "timestamp": frame.frame_index * 100000,  # synthetic
        }

    # ---- Annotations -------------------------------------------------------
    instances = []
    for lb in frame.labels:
        raw_cls = lb.get("class", "")
        cls = PRIVATE2NUSCENES.get(raw_cls)  # remap to NuScenes name
        if cls is None or cls not in CLASS2ID:
            continue
        cx, cy, cz = lb["center"]
        dx, dy, dz = lb["dx"], lb["dy"], lb["dz"]
        yaw = lb["rotateZ"]
        # Number of LiDAR points inside the box; used as a validity flag.
        n_pts = count_points_in_box(frame.lidar_points, cx, cy, cz, dx, dy, dz, yaw)

        instances.append({
            "bbox_3d": [cx, cy, cz, dx, dy, dz, yaw],
            "bbox_label_3d": CLASS2ID[cls],
            "bbox": [0.0, 0.0, 1.0, 1.0],  # no 2D GT available
            "bbox_label": CLASS2ID[cls],
            "velocity": [float("nan"), float("nan")],
            "num_lidar_pts": n_pts,
            "bbox_3d_isvalid": n_pts > 0,
        })

    info = {
        "lidar_points": {
            "lidar_path": lidar_path_rel,
            "num_pts_feats": 3,  # X, Y, Z only
        },
        "token": token,
        "sweeps": [],  # no sweep data available
        "cams": cams_info,
        # lidar == ego (identity)
        "lidar2ego_translation": zero3,
        "lidar2ego_rotation": id_quat,
        # ego == global (identity)
        "ego2global_translation": zero3,
        "ego2global_rotation": id_quat,
        "timestamp": frame.frame_index * 100000,
        # annotations
        "instances": instances,
    }
    return info
|
||||
|
||||
|
||||
def build_coco_json(infos: List[dict]) -> dict:
    """Build a COCO-style mono3d JSON from NuScenes info dicts.

    For every camera of every info dict, one "images" entry is emitted,
    plus one "annotations" entry per 3D instance whose center lies in
    front of that camera and whose projected 2D box is non-degenerate.
    """
    categories = [{"id": i, "name": c} for i, c in enumerate(DETECTION_CLASSES)]
    images = []
    annotations = []
    ann_id = 0  # running COCO annotation id, unique across all images

    for info in infos:
        frame_token = info["token"]
        for nus_cam, cam_info in info["cams"].items():
            img_token = cam_info["sample_data_token"]
            W = cam_info["width"]
            H = cam_info["height"]
            K = np.array(cam_info["cam_intrinsic"])  # 3x3

            img_entry = {
                "file_name": cam_info["data_path"],
                "id": img_token,
                "token": frame_token,
                "cam2ego_rotation": cam_info["sensor2ego_rotation"],
                "cam2ego_translation": cam_info["sensor2ego_translation"],
                "ego2global_rotation": info["ego2global_rotation"],
                "ego2global_translation": info["ego2global_translation"],
                "cam_intrinsic": cam_info["cam_intrinsic"],
                "width": W,
                "height": H,
            }
            images.append(img_entry)

            # Build sensor2lidar matrix for projecting boxes
            t_c2l = np.array(cam_info["sensor2lidar_translation"])
            quat = cam_info["sensor2lidar_rotation"]  # [w,x,y,z]
            w, x, y, z = quat
            # Quaternion -> rotation matrix (camera -> lidar).
            R_c2l = np.array([
                [1-2*(y*y+z*z), 2*(x*y-z*w), 2*(x*z+y*w)],
                [2*(x*y+z*w), 1-2*(x*x+z*z), 2*(y*z-x*w)],
                [2*(x*z-y*w), 2*(y*z+x*w), 1-2*(x*x+y*y)],
            ])
            # Invert the rigid transform to get lidar -> camera.
            R_l2c = R_c2l.T
            t_l2c = -R_l2c @ t_c2l

            # Iterate over the valid "instances" key generated by frame_to_info
            for inst in info.get("instances", []):
                cls_id = inst["bbox_label_3d"]
                if cls_id < 0 or cls_id >= len(DETECTION_CLASSES):
                    continue

                cls = DETECTION_CLASSES[cls_id]
                cx_l, cy_l, cz_l, dx, dy, dz, yaw_l = inst["bbox_3d"]

                # Transform center to camera frame
                center_l = np.array([cx_l, cy_l, cz_l])
                center_c = R_l2c @ center_l + t_l2c
                depth = center_c[2]
                if depth <= 0:
                    continue  # box behind camera

                # Project 3D center to 2D
                center_2d = K @ center_c
                u = center_2d[0] / center_2d[2]
                v = center_2d[1] / center_2d[2]

                # Yaw in camera frame (approx: only rotate around Z in lidar)
                yaw_cam = yaw_l

                # 3D box annotation: [cx_cam, cy_cam, cz_cam, dx, dy, dz, yaw_cam]
                bbox_cam3d = [float(center_c[0]), float(center_c[1]), float(center_c[2]),
                              float(dx), float(dy), float(dz), float(yaw_cam)]

                # Project 8 corners to get 2D bbox
                corners_l = _box_corners(cx_l, cy_l, cz_l, dx, dy, dz, yaw_l)
                corners_c = (R_l2c @ corners_l.T).T + t_l2c
                # Drop individual corners that fall behind the camera.
                corners_c = corners_c[corners_c[:, 2] > 0]
                if len(corners_c) == 0:
                    continue
                proj_h = (K @ corners_c.T).T
                uvs = proj_h[:, :2] / proj_h[:, 2:3]
                x1, y1 = uvs[:, 0].min(), uvs[:, 1].min()
                x2, y2 = uvs[:, 0].max(), uvs[:, 1].max()
                # Clamp the 2D box to the image bounds.
                x1 = max(0, min(x1, W)); x2 = max(0, min(x2, W))
                y1 = max(0, min(y1, H)); y2 = max(0, min(y2, H))
                bw, bh = x2 - x1, y2 - y1
                if bw <= 0 or bh <= 0:
                    continue

                # Extract velocity from the instance dict
                vx, vy = inst.get("velocity", [float("nan"), float("nan")])

                ann = {
                    "file_name": cam_info["data_path"],
                    "image_id": img_token,
                    "area": float(bw * bh),
                    "category_name": cls,
                    "category_id": cls_id,
                    "bbox": [float(x1), float(y1), float(bw), float(bh)],
                    "iscrowd": 0,
                    "bbox_cam3d": bbox_cam3d,
                    "velo_cam3d": [vx, vy],
                    "center2d": [float(u), float(v), float(depth)],
                    "attribute_name": "",
                    "attribute_id": -1,
                    "id": ann_id,
                }
                annotations.append(ann)
                ann_id += 1

    return {"categories": categories, "images": images, "annotations": annotations}
|
||||
|
||||
|
||||
def _box_corners(cx, cy, cz, dx, dy, dz, yaw) -> np.ndarray:
|
||||
"""Return 8 corners of a 3D box as (8,3) array."""
|
||||
hx, hy, hz = dx / 2, dy / 2, dz / 2
|
||||
corners = np.array([
|
||||
[ hx, hy, hz], [ hx, hy, -hz], [ hx, -hy, hz], [ hx, -hy, -hz],
|
||||
[-hx, hy, hz], [-hx, hy, -hz], [-hx, -hy, hz], [-hx, -hy, -hz],
|
||||
], dtype=np.float32)
|
||||
c, s = np.cos(yaw), np.sin(yaw)
|
||||
Rz = np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]], dtype=np.float32)
|
||||
corners = (Rz @ corners.T).T + np.array([cx, cy, cz], dtype=np.float32)
|
||||
return corners
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Main
|
||||
# ===========================================================================
|
||||
|
||||
def main():
    """CLI entry point: load the dataset, split it, and write the
    NuScenes-style .pkl and mono3d .coco.json files plus a class list."""
    parser = argparse.ArgumentParser(description="Convert private dataset to NuScenes-style PKL/JSON.")
    parser.add_argument("--root", default="sample", help="Root folder of your dataset")
    parser.add_argument("--out", default="my_dataset", help="Output directory")
    parser.add_argument("--split", type=float, default=0.8, help="Train fraction (0-1)")
    parser.add_argument("--tag", default="mydata", help="Output file prefix")
    args = parser.parse_args()

    # Output layout mirrors the NuScenes "samples/<SENSOR>" convention.
    out_root = Path(args.out)
    lidar_dir = out_root / "samples" / "LIDAR_TOP"
    img_dir = out_root / "samples"
    rel_lidar = "samples/LIDAR_TOP"
    rel_img = "samples"

    print("Loading dataset ...")
    ds = MyDataset(args.root)

    # Collect all (sample_id, frame_index) pairs
    all_frames = []
    for sid in sorted(ds.sample_ids):
        sample = ds.get_sample(sid)
        for fidx in sorted(sample.frames.keys()):
            all_frames.append((sid, fidx, sample.frames[fidx]))

    print(f"Total frames: {len(all_frames)}")
    # Sequential (non-shuffled) split: the first `split` fraction -> train.
    n_train = int(len(all_frames) * args.split)
    train_frames = all_frames[:n_train]
    val_frames = all_frames[n_train:]

    def convert_split(frames, split_name):
        # Convert one split and write its .pkl and .coco.json files.
        infos = []
        for sid, fidx, frame in tqdm(frames, desc=f"Converting {split_name}"):
            info = frame_to_info(
                frame, sid,
                out_lidar_dir=str(lidar_dir),
                out_img_dir=str(img_dir),
                rel_lidar_root=rel_lidar,
                rel_img_root=rel_img,
            )
            infos.append(info)

        pkl_path = out_root / f"{args.tag}_infos_{split_name}.pkl"
        json_path = out_root / f"{args.tag}_infos_{split_name}_mono3d.coco.json"

        # MMDetection3D-style pickle: metainfo + flat data_list.
        with open(pkl_path, "wb") as f:
            pickle.dump({
                "metainfo": {
                    "version": "custom-v1.0",
                    "classes": DETECTION_CLASSES,
                },
                "data_list": infos,
            }, f)
        print(f"Saved {pkl_path} ({len(infos)} frames)")

        coco = build_coco_json(infos)
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(coco, f, ensure_ascii=False, indent=2)
        print(f"Saved {json_path} ({len(coco['images'])} images, {len(coco['annotations'])} anns)")

    convert_split(train_frames, "train")
    convert_split(val_frames, "val")

    # Write class list for reference
    classes_path = out_root / f"{args.tag}_classes.txt"
    classes_path.write_text("\n".join(DETECTION_CLASSES))
    print(f"\nDone. Output structure:\n {out_root}/")
    print(f" ├── samples/LIDAR_TOP/*.bin")
    print(f" ├── samples/CAM_FRONT/*.jpg (and other cameras)")
    print(f" ├── {args.tag}_infos_train.pkl")
    print(f" ├── {args.tag}_infos_val.pkl")
    print(f" ├── {args.tag}_infos_train_mono3d.coco.json")
    print(f" └── {args.tag}_infos_val_mono3d.coco.json")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user