configs/experiment/projects/bridging/dinosaur/movi_e_feat_rec.yaml
# @package _global_
# ViT feature reconstruction on MOVI-E.
#
# NOTE(review): the nesting below is reconstructed from key semantics (each
# `_target_` belongs to exactly one module mapping). Confirm it against the
# base configs listed under `defaults`.
defaults:
  - /experiment/projects/bridging/dinosaur/_base_feature_recon  # (1)!
  - /dataset: movi_e_image  # (2)!
  - /experiment/projects/bridging/dinosaur/_preprocessing_movi_dino_feature_recon  # (3)!
  - /experiment/projects/bridging/dinosaur/_metrics_clevr_patch  # (4)!
  # `_self_` is listed last, so the values in this file are composed after
  # the entries above.
  - _self_

# The following parameters assume training on 8 GPUs, leading to an effective
# batch size of 64 (8 devices x per-device batch_size of 8).
trainer:
  devices: 8
  max_steps: 500000
  # Explicit null (same value as the bare `max_epochs:` form). Presumably this
  # clears an inherited epoch limit so that max_steps governs training length
  # -- TODO confirm against the base config.
  max_epochs: null

dataset:
  num_workers: 4
  # Per-device batch size; see the effective batch size note above.
  batch_size: 8

models:
  conditioning:
    _target_: routed.ocl.conditioning.RandomConditioning
    n_slots: 24
    object_dim: 128
    batch_size_path: input.batch_size

  feature_extractor:
    model_name: vit_small_patch8_224_dino

  object_decoder:
    _target_: routed.ocl.decoding.PatchDecoder
    # Presumably (224 / 8)^2 = 784 patches for the patch-8, 224px ViT named in
    # feature_extractor -- verify if the feature extractor is changed.
    num_patches: 784
    decoder:
      _target_: ocl.neural_networks.build_mlp
      _partial_: true
      features: [1024, 1024, 1024]
    object_features_path: perceptual_grouping.objects

  # NOTE(review): assumed to be an entry of `models` like the modules above;
  # confirm against _base_feature_recon.
  masks_as_image:
    _target_: routed.ocl.utils.resizing.Resize
    input_path: object_decoder.masks
    size: 128
    resize_mode: bilinear
    patch_mode: true
- /experiment/projects/bridging/dinosaur/_base_feature_recon
- /dataset/movi_e_image
- /experiment/projects/bridging/dinosaur/_preprocessing_movi_dino_feature_recon
- /experiment/projects/bridging/dinosaur/_metrics_clevr_patch