"""" |
This file is the entry point for launching experiments with Implicitron.

Launch Training
---------------
Experiment config .yaml files are located in the
`projects/implicitron_trainer/configs` folder. To launch an experiment,
specify the name of the file. Specific config values can also be overridden
from the command line, for example:

```
./experiment.py --config-name base_config.yaml override.param.one=42 override.param.two=84
```

Main functions
---------------
- The Experiment class defines `run`, which creates the model, optimizer, and
  other objects used in training, then starts TrainingLoop's `run` function.
- TrainingLoop takes care of the actual training logic: forward and backward
  passes, evaluation and testing, as well as model checkpointing,
  visualization, and metric printing.

Outputs
--------
The outputs of the experiment are saved and logged in multiple ways:
  - Checkpoints:
        Model, optimizer and stats are stored in the directory
        named by the `exp_dir` key from the config file / CLI parameters.
  - Stats:
        Stats are logged and plotted to the file "train_stats.pdf" in the
        same directory. The stats are also saved as part of the checkpoint
        file.
  - Visualizations:
        Predictions are plotted to a visdom server running at the
        port specified by the `visdom_server` and `visdom_port` keys in the
        config file.
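
For example, to redirect the outputs and the visdom plots (a hypothetical
invocation; `exp_dir` and `visdom_port` are the config keys described above,
and 8097 is just an example port):

```
./experiment.py --config-name base_config.yaml exp_dir=./data/my_experiment/ visdom_port=8097
```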
"""

import logging
import os
import warnings
from dataclasses import field

import hydra
import torch
from accelerate import Accelerator
from omegaconf import DictConfig, OmegaConf
from packaging import version

from pytorch3d.implicitron.dataset.data_source import (
    DataSourceBase,
    ImplicitronDataSource,
)
from pytorch3d.implicitron.models.base_model import ImplicitronModelBase
from pytorch3d.implicitron.models.renderer.multipass_ea import (
    MultiPassEmissionAbsorptionRenderer,
)
from pytorch3d.implicitron.models.renderer.ray_sampler import AdaptiveRaySampler
from pytorch3d.implicitron.tools.config import (
    Configurable,
    expand_args_fields,
    remove_unused_components,
    run_auto_creation,
)

from .impl.model_factory import ModelFactoryBase
from .impl.optimizer_factory import OptimizerFactoryBase
from .impl.training_loop import TrainingLoopBase
from .impl.utils import seed_all_random_engines

logger = logging.getLogger(__name__)


_RUN = hydra.types.RunMode.RUN

if version.parse(hydra.__version__) < version.Version("1.1"):
    raise ValueError(
        f"Hydra version {hydra.__version__} is too old."
        " (Implicitron requires version 1.1 or later.)"
    )
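
# Optional import of cluster-specific launch hooks. The fair_cluster module
# only exists on FAIR-internal installs, so its absence is silently ignored.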
try:
    import pytorch3d.implicitron.fair_cluster.slurm
except ModuleNotFoundError:
    pass
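
# If the PYTORCH3D_NO_ACCELERATE environment variable is set, HuggingFace
# Accelerate is bypassed and training runs in a single process on cuda:0
# (see Experiment.run below).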
no_accelerate = os.environ.get("PYTORCH3D_NO_ACCELERATE") is not None


class Experiment(Configurable):
    """
    This class is at the top level of Implicitron's config hierarchy. Its
    members are high-level components necessary for training an implicit
    rendering network.

    Members:
        data_source: An object that produces datasets and dataloaders.
        model_factory: An object that produces an implicit rendering model as
            well as its corresponding Stats object.
        optimizer_factory: An object that produces the optimizer and lr
            scheduler.
        training_loop: An object that runs training given the outputs produced
            by the data_source, model_factory and optimizer_factory.
        seed: A random seed to ensure reproducibility.
        detect_anomaly: Whether torch.autograd should detect anomalies. Useful
            for debugging, but might slow down the training.
        exp_dir: Root experiment directory. Checkpoints and training stats
            will be saved here.
    """

    data_source: DataSourceBase
    data_source_class_type: str = "ImplicitronDataSource"
    model_factory: ModelFactoryBase
    model_factory_class_type: str = "ImplicitronModelFactory"
    optimizer_factory: OptimizerFactoryBase
    optimizer_factory_class_type: str = "ImplicitronOptimizerFactory"
    training_loop: TrainingLoopBase
    training_loop_class_type: str = "ImplicitronTrainingLoop"

    seed: int = 42
    detect_anomaly: bool = False
    exp_dir: str = "./data/default_experiment/"
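
    # Local overrides of hydra's own behavior: run in the current working
    # directory, skip creating the .hydra output subdirectory, and force
    # single-run mode.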
    hydra: dict = field(
        default_factory=lambda: {
            "run": {"dir": "."},
            "output_subdir": None,
            "mode": _RUN,
        }
    )

    def __post_init__(self):
        seed_all_random_engines(self.seed)
        run_auto_creation(self)

    def run(self) -> None:
        # Initialize the accelerator if desired; otherwise run unaccelerated
        # on a single GPU.
        if no_accelerate:
            accelerator = None
            device = torch.device("cuda:0")
        else:
            accelerator = Accelerator(device_placement=False)
            logger.info(accelerator.state)
            device = accelerator.device

        logger.info(f"Running experiment on device: {device}")
        os.makedirs(self.exp_dir, exist_ok=True)

        if self.detect_anomaly:
            logger.info("Enabling anomaly detection!")
            torch.autograd.set_detect_anomaly(self.detect_anomaly)
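
        # The data source builds the datasets and the corresponding
        # dataloaders; the train/val/test splits are consumed below.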
        datasets, dataloaders = self.data_source.get_datasets_and_dataloaders()

        model = self.model_factory(
            accelerator=accelerator,
            exp_dir=self.exp_dir,
        )
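
        # Optionally resume the training statistics stored with the
        # checkpoints in exp_dir; the last recorded epoch determines where
        # training restarts.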
        stats = self.training_loop.load_stats(
            log_vars=model.log_vars,
            exp_dir=self.exp_dir,
            resume=self.model_factory.resume,
            resume_epoch=self.model_factory.resume_epoch,
        )
        start_epoch = stats.epoch + 1

        model.to(device)

        optimizer, scheduler = self.optimizer_factory(
            accelerator=accelerator,
            exp_dir=self.exp_dir,
            last_epoch=start_epoch,
            model=model,
            resume=self.model_factory.resume,
            resume_epoch=self.model_factory.resume_epoch,
        )
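
        # Let accelerate wrap the model, optimizer and the train/val loaders
        # for (possibly distributed) execution; note that the test loader is
        # not wrapped.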
        train_loader = dataloaders.train
        val_loader = dataloaders.val
        test_loader = dataloaders.test
        if accelerator is not None:
            (
                model,
                optimizer,
                train_loader,
                val_loader,
            ) = accelerator.prepare(model, optimizer, train_loader, val_loader)
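
        # Enter the main training loop, which owns the forward/backward
        # passes, evaluation, checkpointing and visualization.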
        self.training_loop.run(
            train_loader=train_loader,
            val_loader=val_loader,
            test_loader=test_loader,
            train_dataset=datasets.train,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            accelerator=accelerator,
            device=device,
            exp_dir=self.exp_dir,
            stats=stats,
            seed=self.seed,
        )


def _setup_envvars_for_cluster() -> bool:
    """
    Prepares to run on the cluster if relevant.
    Returns whether the FAIR cluster is in use.
    """
    try:
        import submitit
    except ImportError:
        return False

    try:
        job_env = submitit.JobEnvironment()
    except RuntimeError:
        return False
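
    # Translate the submitit job description into the environment variables
    # that torch.distributed (and hence accelerate) expects.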
    os.environ["LOCAL_RANK"] = str(job_env.local_rank)
    os.environ["RANK"] = str(job_env.global_rank)
    os.environ["WORLD_SIZE"] = str(job_env.num_tasks)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "42918"
    logger.info(
        "Num tasks %s, global_rank %s", job_env.num_tasks, job_env.global_rank
    )

    return True


def dump_cfg(cfg: DictConfig) -> None:
    remove_unused_components(cfg)

    os.makedirs(cfg.exp_dir, exist_ok=True)
    try:
        cfg_filename = os.path.join(cfg.exp_dir, "expconfig.yaml")
        OmegaConf.save(config=cfg, f=cfg_filename)
    except PermissionError:
        warnings.warn("Can't dump config due to insufficient permissions!")
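
# Register the expanded Experiment dataclass with hydra's ConfigStore so that
# "default_config" below resolves to its schema.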
expand_args_fields(Experiment)
cs = hydra.core.config_store.ConfigStore.instance()
cs.store(name="default_config", node=Experiment)


@hydra.main(config_path="./configs/", config_name="default_config")
def experiment(cfg: DictConfig) -> None:
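    # PCI_BUS_ID ordering makes CUDA device indices match nvidia-smi, so that
    # device numbers in logs refer to the same physical GPUs.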
    if "CUDA_DEVICE_ORDER" not in os.environ:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

    if not _setup_envvars_for_cluster():
        logger.info("Running locally")
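
    # Ensure the main pluggable components are expanded by the config system
    # before the Experiment is instantiated from the parsed config.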
    expand_args_fields(ImplicitronModelBase)
    expand_args_fields(AdaptiveRaySampler)
    expand_args_fields(MultiPassEmissionAbsorptionRenderer)
    expand_args_fields(ImplicitronDataSource)

    experiment = Experiment(**cfg)
    dump_cfg(cfg)
    experiment.run()


if __name__ == "__main__":
    experiment()