Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 161 additions & 0 deletions extract_information_v1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import os
import torch
import numpy as np
import pandas as pd
from tkinter import filedialog, Tk
import json
import matplotlib.pyplot as plt
from PIL import Image
import gc

# Importaciones específicas de VGGT
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images
from vggt.utils.pose_enc import pose_encoding_to_extri_intri

# --- CONFIGURACIÓN ---
BATCH_SIZE = 10 # Ajustado para evitar OOM
# ---------------------

def select_folder(prompt):
    """Open a native folder-picker dialog and return the chosen path.

    Returns an empty string if the user cancels the dialog.
    """
    root = Tk()
    root.withdraw()  # hide the empty Tk main window
    root.attributes('-topmost', True)  # keep the dialog in front of other windows
    try:
        return filedialog.askdirectory(title=prompt)
    finally:
        # Destroy the root so repeated calls don't leak hidden Tk windows
        # (matches the behavior of select_folder in extract_information_v2.py).
        root.destroy()

def extract_camera_parameters(extrinsic, intrinsic):
    """
    Decompose VGGT output matrices into interpretable camera parameters.

    Convention: the extrinsic [R|t] maps World -> Camera coordinates.
    Returns a dict with the mean focal length, principal point, full
    intrinsic matrix, camera center in world coordinates, and the
    camera-to-world rotation matrix.
    """
    # Intrinsics: average the two focal lengths, read the principal point.
    fx, fy = intrinsic[0, 0], intrinsic[1, 1]
    principal = [float(intrinsic[0, 2]), float(intrinsic[1, 2])]

    # Extrinsics: invert the world->camera transform to get the pose.
    rotation_wc = extrinsic[:3, :3].T            # camera -> world rotation
    center = -np.dot(rotation_wc, extrinsic[:3, 3])  # camera center C = -R^T t

    return {
        "focal_length": float((fx + fy) / 2.0),
        "principal_point": principal,
        "intrinsic_matrix": intrinsic.tolist(),
        "camera_position": center.tolist(),
        "rotation_matrix_wc": rotation_wc.tolist(),  # pose rotation, not [R|t]'s R
    }

def save_depth_map(depth_tensor, output_path):
    """Render a depth map as a colorized inverse-depth JPEG.

    Inverse depth is robust-normalized between its 5th and 95th
    percentiles so outliers do not wash out the color range.
    """
    inv = 1.0 / (depth_tensor + 1e-6)  # epsilon avoids division by zero
    hi = np.percentile(inv, 95)
    lo = np.percentile(inv, 5)
    normalized = np.clip((inv - lo) / (hi - lo + 1e-8), 0, 1)

    # "turbo" reads well for depth; drop the alpha channel before saving.
    rgb = (plt.get_cmap("turbo")(normalized)[..., :3] * 255).astype(np.uint8)
    Image.fromarray(rgb).save(output_path, format="JPEG", quality=85)

def process_batch(model, batch_files, input_folder, output_folder, depth_out_dir, device, dtype):
    """Run VGGT on one batch of images and collect per-image camera records.

    Args:
        model: VGGT model already moved to `device` and set to eval mode.
        batch_files: image file names, relative to `input_folder`.
        input_folder: directory containing the input images.
        output_folder: kept for interface compatibility (unused here).
        depth_out_dir: directory where colorized depth JPEGs are written.
        device: "cuda" or "cpu".
        dtype: autocast dtype (bfloat16/float16) used for CUDA inference.

    Returns:
        A list of per-image record dicts, or [] if the batch failed to load.
    """
    image_paths = [os.path.join(input_folder, f) for f in batch_files]

    try:
        images_tensor = load_and_preprocess_images(image_paths).to(device)
        if images_tensor.ndim == 4:
            # Add the sequence/batch dimension the model expects.
            images_tensor = images_tensor.unsqueeze(0)
    except Exception as e:
        print(f"Error cargando batch: {e}")
        return []

    with torch.no_grad():
        # Modern autocast API; explicitly disabled on CPU, where fp16
        # autocast is not supported (previously it was entered unconditionally).
        with torch.amp.autocast('cuda', dtype=dtype, enabled=(device == "cuda")):
            predictions = model(images_tensor)

    pose_enc = predictions["pose_enc"]
    img_size_hw = images_tensor.shape[-2:]
    extrinsics, intrinsics = pose_encoding_to_extri_intri(pose_enc, img_size_hw)

    # float64 for numeric stability when inverting the poses downstream.
    extrinsics = extrinsics.squeeze(0).cpu().numpy().astype(np.float64)
    intrinsics = intrinsics.squeeze(0).cpu().numpy().astype(np.float64)

    depths_tensor = None
    depths_np = None
    if "depth" in predictions:
        depths_tensor = predictions["depth"]
        # Drop batch and trailing channel dims -> (S, H, W).
        depths_np = depths_tensor.squeeze(0).squeeze(-1).cpu().numpy()

    batch_records = []

    for i, img_name in enumerate(batch_files):
        params = extract_camera_parameters(extrinsics[i], intrinsics[i])

        depth_filename = ""
        if depths_np is not None:
            depth_filename = f"depth_{os.path.splitext(img_name)[0]}.jpeg"
            save_depth_map(depths_np[i], os.path.join(depth_out_dir, depth_filename))

        batch_records.append({
            "image_name": img_name,
            "depth_map_file": depth_filename,
            "f": params["focal_length"],
            "cx": params["principal_point"][0],
            "cy": params["principal_point"][1],
            "tx": params["camera_position"][0],
            "ty": params["camera_position"][1],
            "tz": params["camera_position"][2],
            # Matrices are JSON-encoded so they survive a round trip through CSV.
            "intrinsic_matrix": json.dumps(params["intrinsic_matrix"]),
            "rotation_matrix_wc": json.dumps(params["rotation_matrix_wc"]),
        })

    # Drop references and release cached GPU memory between batches.
    del images_tensor, predictions, pose_enc, extrinsics, intrinsics, depths_tensor
    if device == "cuda":
        torch.cuda.empty_cache()

    return batch_records

def process_images_vx(input_folder, output_folder):
    """Run VGGT over every image in `input_folder` in batches of BATCH_SIZE
    and write per-camera parameters to a CSV (plus colorized depth maps).

    Args:
        input_folder: directory containing .png/.jpg/.jpeg images.
        output_folder: directory where the CSV and `depth_maps/` are written.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # bfloat16 needs compute capability >= 8 (Ampere+); otherwise fall back to fp16.
    dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16

    print(f"Cargando modelo en {device}...")
    model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
    model.eval()

    # Require a dot before the extension so names like "snapshot_png" are not
    # matched (same filter as extract_information_v2.py).
    valid_exts = ('.png', '.jpg', '.jpeg')
    image_files = sorted(f for f in os.listdir(input_folder) if f.lower().endswith(valid_exts))
    total_images = len(image_files)
    print(f"Encontradas {total_images} imágenes.")

    depth_out_dir = os.path.join(output_folder, "depth_maps")
    os.makedirs(depth_out_dir, exist_ok=True)

    all_records = []

    for i in range(0, total_images, BATCH_SIZE):
        batch_files = image_files[i : i + BATCH_SIZE]
        print(f"Procesando {i}/{total_images}...")
        records = process_batch(model, batch_files, input_folder, output_folder, depth_out_dir, device, dtype)
        all_records.extend(records)
        gc.collect()  # keep host memory bounded between batches

    df = pd.DataFrame(all_records)
    csv_path = os.path.join(output_folder, "vggt_camera_data.csv")
    df.to_csv(csv_path, index=False)
    print(f"Hecho. CSV en: {csv_path}")

if __name__ == "__main__":
print("Selecciona carpeta de entrada...")
in_dir = select_folder("Entrada")
if in_dir:
print("Selecciona carpeta de salida...")
out_dir = select_folder("Salida")
if out_dir:
process_images_vx(in_dir, out_dir)
185 changes: 185 additions & 0 deletions extract_information_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
import os
import torch
import numpy as np
import pandas as pd
from tkinter import filedialog, Tk
import sys

# Importaciones de VGGT (asegúrate de ejecutar esto desde la raíz del proyecto vggt)
try:
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
except ImportError:
print("Error: No se encuentran los módulos de VGGT. Asegúrate de ejecutar este script desde la carpeta raíz 'vggt-data-augmentation'.")
sys.exit(1)

def select_folder(prompt):
    """Show a native directory-chooser dialog and return the selected path."""
    dialog_root = Tk()
    dialog_root.withdraw()                    # suppress the empty Tk root window
    dialog_root.attributes('-topmost', True)  # make sure the dialog is frontmost
    chosen = filedialog.askdirectory(title=prompt)
    dialog_root.destroy()                     # tear down the root once chosen
    return chosen

def _rotation_to_euler_degrees(R_wc):
    """Convert a camera-to-world rotation matrix to (roll, pitch, yaw) in degrees.

    Uses the ZYX convention (Z=yaw, Y=pitch, X=roll) with the standard
    gimbal-lock fallback when cos(pitch) is ~0.
    """
    import math
    sy = math.sqrt(R_wc[0, 0] * R_wc[0, 0] + R_wc[1, 0] * R_wc[1, 0])
    if sy >= 1e-6:
        x_rot = math.atan2(R_wc[2, 1], R_wc[2, 2])
        y_rot = math.atan2(-R_wc[2, 0], sy)
        z_rot = math.atan2(R_wc[1, 0], R_wc[0, 0])
    else:
        # Gimbal lock: yaw is not recoverable, conventionally set to 0.
        x_rot = math.atan2(-R_wc[1, 2], R_wc[1, 1])
        y_rot = math.atan2(-R_wc[2, 0], sy)
        z_rot = 0
    return np.degrees(x_rot), np.degrees(y_rot), np.degrees(z_rot)

def _save_depth_visualization(d_map, save_path):
    """Save a depth map as a colorized inverse-depth image (turbo colormap)."""
    # Imports are local so the main pipeline works even if these optional
    # visualization deps are missing and no depth head is present.
    import matplotlib.pyplot as plt
    from PIL import Image

    # Inverse depth reads better visually; robust-normalize to [0, 1].
    inv_depth = 1.0 / (d_map + 1e-6)
    vmax = np.percentile(inv_depth, 95)
    vmin = np.percentile(inv_depth, 5)
    norm_depth = np.clip((inv_depth - vmin) / (vmax - vmin + 1e-8), 0, 1)

    cmap = plt.get_cmap("turbo")
    color_depth = (cmap(norm_depth)[..., :3] * 255).astype(np.uint8)

    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    Image.fromarray(color_depth).save(save_path)

def extract_information_vx():
    """Interactive pipeline: pick folders, run VGGT once over all images,
    and write camera parameters (pose, intrinsics, Euler angles) to a CSV."""
    # --- 1. Folder selection ---
    print("Por favor, selecciona la carpeta con las IMÁGENES de entrada...")
    image_folder = select_folder("Selecciona la carpeta con las IMÁGENES")
    if not image_folder:
        print("No se seleccionó carpeta de entrada. Cancelando.")
        return

    print("Por favor, selecciona la carpeta donde guardar el CSV de salida...")
    output_folder = select_folder("Selecciona la carpeta de SALIDA (para guardar el CSV)")
    if not output_folder:
        print("No se seleccionó carpeta de salida. Cancelando.")
        return

    output_csv = os.path.join(output_folder, "camera_data_vx.csv")

    # --- 2. Model setup ---
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # bfloat16 needs compute capability >= 8 (Ampere+); otherwise fall back to fp16.
    dtype = torch.bfloat16 if (torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8) else torch.float16
    print(f"Usando dispositivo: {device}")

    print("Cargando modelo VGGT...")
    try:
        model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
    except Exception as e:
        # Fallback: fetch the raw checkpoint directly from the Hub.
        print(f"Nota: Carga automática falló ({e}), intentando carga manual...")
        model = VGGT()
        _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
        model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
        model = model.to(device)
    model.eval()

    # --- 3. Inference ---
    valid_exts = ('.png', '.jpg', '.jpeg')
    image_files = sorted(os.path.join(image_folder, f) for f in os.listdir(image_folder)
                         if f.lower().endswith(valid_exts))
    if not image_files:
        print(f"Error: No se encontraron imágenes válidas en {image_folder}")
        return

    print(f"Procesando {len(image_files)} imágenes...")
    images_tensor = load_and_preprocess_images(image_files).to(device)

    with torch.no_grad():
        # Modern autocast API (torch.cuda.amp.autocast is deprecated);
        # disabled on CPU, where fp16 autocast is unsupported.
        with torch.amp.autocast('cuda', dtype=dtype, enabled=(device == "cuda")):
            # Ensure the (B, S, C, H, W) layout the model expects.
            images_input = images_tensor.unsqueeze(0) if images_tensor.ndim == 4 else images_tensor
            predictions = model(images_input)

        pose_enc = predictions["pose_enc"]
        extrinsics, intrinsics = pose_encoding_to_extri_intri(pose_enc, images_tensor.shape[-2:])

    extrinsics = extrinsics.squeeze(0).float().cpu().numpy()
    intrinsics = intrinsics.squeeze(0).float().cpu().numpy()

    # --- 4. Per-image records ---
    data_records = []
    for i, img_path in enumerate(image_files):
        K = intrinsics[i]
        E = extrinsics[i]  # [R|t], world -> camera

        R = E[:3, :3]
        t = E[:3, 3]

        camera_center_world = -np.dot(R.T, t)  # camera center C = -R^T t
        R_wc = R.T                             # camera orientation in the world

        roll, pitch, yaw = _rotation_to_euler_degrees(R_wc)

        # Relative height for UAV-style data: VGGT tends to align +Z with the
        # viewing direction, so moving along -Z corresponds to gaining altitude.
        # NOTE(review): convention assumed from the original comments — confirm.
        height_rel = -camera_center_world[2]

        # --- 5. Optional depth-map export ---
        depth_rel_path = ""
        if "depth" in predictions:
            # predictions["depth"] shape: (B, S, H, W, 1)
            d_map = predictions["depth"][0, i, :, :, 0].float().cpu().numpy()
            # splitext (unlike split('.')[0]) keeps names containing extra dots intact.
            stem = os.path.splitext(os.path.basename(img_path))[0]
            depth_filename = f"depth_{stem}.png"
            _save_depth_visualization(d_map, os.path.join(output_folder, "depth_maps", depth_filename))
            depth_rel_path = depth_filename

        data_records.append({
            "image_name": os.path.splitext(os.path.basename(img_path))[0],  # extension-free name
            "full_path": img_path,
            "depth_map_path": depth_rel_path,
            "focal_x": K[0, 0],
            "focal_y": K[1, 1],
            "principal_x": K[0, 2],
            "principal_y": K[1, 2],
            "pos_x": camera_center_world[0],
            "pos_y": camera_center_world[1],
            "pos_z": camera_center_world[2],
            "height": height_rel,
            "roll": roll,
            "pitch": pitch,
            "yaw": yaw,
            "R_world_flat": R_wc.flatten().tolist()
        })

    df = pd.DataFrame(data_records)
    df.to_csv(output_csv, index=False)
    print(f"¡Éxito! Datos guardados en: {output_csv}")

if __name__ == "__main__":
extract_information_vx()
12 changes: 12 additions & 0 deletions requirements_da.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# PyTorch with native CUDA 13.0 support
--index-url https://download.pytorch.org/whl/cu130
torch
torchvision
torchaudio

# Core dependencies - updated for Python 3.13
numpy>=2.0.0
pandas
matplotlib
Pillow
huggingface_hub
einops
safetensors
Loading