configs/experiment/projects/bridging/dinosaur/movi_e_feat_rec.yaml

# @package _global_
# ViT feature reconstruction on MOVI-E.
# Hydra defaults list: the entries below are composed in the order given.
defaults:
  # Shared base config for the feature-reconstruction experiments (going by its name).
  - /experiment/projects/bridging/dinosaur/_base_feature_recon  # (1)!
  # Selects the `movi_e_image` option of the `dataset` config group.
  - /dataset: movi_e_image  # (2)!
  # Preprocessing config; by its name it targets MOVi data with DINO feature reconstruction.
  - /experiment/projects/bridging/dinosaur/_preprocessing_movi_dino_feature_recon # (3)!
  # NOTE(review): this metrics config is named for CLEVR although the dataset is
  # MOVI-E — confirm the reuse is intended.
  - /experiment/projects/bridging/dinosaur/_metrics_clevr_patch # (4)!
  # `_self_` is listed last, so keys defined in this file override the configs above.
  - _self_

# The following parameters assume training on 8 GPUs with a batch size of 8 each, giving an effective batch size of 64.
# Trainer overrides; the run length is expressed in optimisation steps.
trainer:
  # Number of devices to train on.
  devices: 8
  max_steps: 500000
  # Explicit `null` instead of a bare empty value: the parsed value is the same,
  # but it makes clear that the epoch limit is deliberately unset (presumably so
  # that `max_steps` alone bounds training — confirm against the base config).
  max_epochs: null

# Data-loading overrides for the dataset selected in the defaults list.
dataset:
  # Batch size of 8; together with the 8 trainer devices this yields the
  # effective batch size of 64 mentioned in the comment above `trainer`.
  batch_size: 8
  # Presumably the number of data-loader worker processes — confirm against
  # the dataset config.
  num_workers: 4

# Overrides for individual model components.
models:
  # Slot conditioning component.
  conditioning:
    _target_: routed.ocl.conditioning.RandomConditioning
    # Path from which the batch size is read (judging by the key name).
    batch_size_path: input.batch_size
    # 24 slots with an object dimension of 128.
    n_slots: 24
    object_dim: 128
  # Only the backbone name is set here; no `_target_` is given, so the rest of
  # the feature-extractor definition presumably comes from the defaults list.
  feature_extractor:
    model_name: vit_small_patch8_224_dino

  # Patch decoder fed with the objects produced by perceptual grouping.
  object_decoder:
    _target_: routed.ocl.decoding.PatchDecoder
    object_features_path: perceptual_grouping.objects
    # 784 = 28 * 28, i.e. a 224-pixel input split into 8-pixel patches —
    # assuming the input and patch sizes encoded in the ViT model name
    # (vit_small_patch8_224_dino). Keep the two values in sync.
    num_patches: 784
    # MLP builder, partially instantiated; `features` lists three widths of
    # 1024 (presumably the hidden layers).
    decoder:
      _partial_: true
      _target_: ocl.neural_networks.build_mlp
      features: [1024, 1024, 1024]

  # Resizes the decoder masks to size 128 with bilinear interpolation.
  masks_as_image:
    _target_: routed.ocl.utils.resizing.Resize
    input_path: object_decoder.masks
    # Presumably marks the input as a flat sequence of patches rather than a
    # 2D map — confirm against the Resize implementation.
    patch_mode: true
    resize_mode: bilinear
    size: 128