configs/experiment/SAVi_code/cater_savi_with_predictor.yaml

# @package _global_
# An example implementation of SAVi that leverages a model definition in code.
# The code can be found in `ocl/models/savi.py`, the config is used to
# instantiate the submodules used by the code.
# Hydra defaults list: the configs composed into this experiment.
# The `(N)!` markers look like documentation annotation anchors; keep them
# attached to their entries when editing.
defaults:
  - /experiment/_output_path  # (1)!
  - /training_config  # (2)!
  - /dataset: cater  # (3)!
  # `_self_` comes last so the values in this file override the defaults above.
  - _self_


# Trainer settings (the callback below comes from PyTorch Lightning).
trainer:
  devices: 8
  # Gradients are clipped by norm with a threshold of 0.05.
  gradient_clip_val: 0.05
  gradient_clip_algorithm: norm
  # Explicitly unset: no epoch limit is configured here, only `max_steps`.
  max_epochs: null
  max_steps: 199999
  callbacks:
    # Records the learning rate once per step.
    - _target_: pytorch_lightning.callbacks.LearningRateMonitor
      logging_interval: step

# Data loading options and per-split transforms, layered on the `cater`
# dataset config selected in the defaults list.
dataset:
  num_workers: 4
  batch_size: 8

  # NOTE(review): the transform keys carry numeric prefixes (`02_`, `03_`) but
  # are listed out of numeric order — presumably the prefix, not the position
  # in this file, decides the application order; confirm against the consumer.
  train_transforms:
    # Converts the `image` field with `VideoToTensor`.
    03_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: ocl.preprocessing.VideoToTensor
      batch_transform: false
    # Samples 6 consecutive entries from the `image` field.
    02_random_strided_window:
      _target_: ocl.transforms.SampleConsecutive
      fields:
        - image
      n_consecutive: 6
  # Evaluation has no `SampleConsecutive` step and additionally converts the
  # `mask` field, which the evaluation metrics read as `input.mask`.
  eval_transforms:
    03_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: ocl.preprocessing.VideoToTensor
        mask:
          _target_: ocl.preprocessing.MultiMaskToTensor

      batch_transform: false
# Submodules handed to the code-defined SAVi model (`ocl.models.SAVi`).
models:
  _target_: ocl.models.SAVi
  # Learnt initial slots; `object_dim` is the slot dimension that the other
  # submodules below interpolate.
  conditioning:
    _target_: routed.ocl.conditioning.LearntConditioning
    object_dim: 128
    n_slots: 11
    batch_size_path: input.batch_size
  feature_extractor:
    # Use the smaller version of the feature extractor architecture.
    _target_: ocl.feature_extractors.SAViFeatureExtractor
    larger_input_arch: false

  perceptual_grouping:
    _target_: ocl.perceptual_grouping.SlotAttentionGrouping
    feature_dim: 32
    object_dim: ${models.conditioning.object_dim}
    iters: 2
    kvq_dim: 128
    use_projection_bias: false
    # Positional embedding followed by a two-layer MLP, applied in sequence.
    positional_embedding:
      _target_: ocl.neural_networks.wrappers.Sequential
      _args_:
        - _target_: ocl.neural_networks.positional_embedding.SoftPositionEmbed
          n_spatial_dims: 2
          feature_dim: 32
          savi_style: true
        - _target_: ocl.neural_networks.build_two_layer_mlp
          input_dim: 32
          output_dim: 32
          hidden_dim: 64
          initial_layer_norm: true
    # Explicitly unset: no feed-forward MLP is configured for the grouping.
    ff_mlp: null

  decoder:
    _target_: ocl.decoding.SlotAttentionDecoder
    decoder:
      _target_: ocl.decoding.get_savi_decoder_backbone
      object_dim: ${models.perceptual_grouping.object_dim}
      larger_input_arch: false
    positional_embedding:
      _target_: ocl.neural_networks.positional_embedding.SoftPositionEmbed
      n_spatial_dims: 2
      feature_dim: ${models.perceptual_grouping.object_dim}
      cnn_channel_order: true
      savi_style: true

  # Transformer encoder layer used as the transition model (predictor).
  transition_model:
    _target_: torch.nn.TransformerEncoderLayer
    # Tied to the slot dimension so that overriding `object_dim` keeps the
    # transition model in sync. Assumes the transition model is applied to
    # slots — verify against `ocl/models/savi.py`.
    d_model: ${models.conditioning.object_dim}
    nhead: 4
    dim_feedforward: 256
    batch_first: true

# Training losses. A single reconstruction loss (`loss_type: mse_sum`) compares
# `decoder.reconstruction` against `input.image`.
losses:
  mse:
    _target_: routed.ocl.losses.ReconstructionLoss
    loss_type: mse_sum
    input_path: decoder.reconstruction
    target_path: input.image

# Visualizations of inputs and decoder outputs.
visualizations:
  input:
    _target_: routed.ocl.visualizations.Video
    # Single place where denormalization is set; every other entry below
    # interpolates this value. `null` means no denormalization is configured.
    denormalization: null
    video_path: input.image
  reconstruction:
    _target_: routed.ocl.visualizations.Video
    denormalization: ${..input.denormalization}
    video_path: decoder.reconstruction
  objects:
    _target_: routed.ocl.visualizations.VisualObject
    denormalization: ${..input.denormalization}
    object_path: decoder.object_reconstructions
    mask_path: decoder.masks
  objectmot:
    _target_: routed.ocl.visualizations.TrackedObject_from_Mask
    n_clips: 1
    # Follows the input's denormalization like the sibling entries, since this
    # visualization also reads `input.image`.
    denormalization: ${..input.denormalization}
    video_path: input.image
    object_masks_path: decoder.masks

# Evaluation metrics. Both compare the decoder's predicted masks
# (`decoder.masks`) with the masks from the input (`input.mask`).
evaluation_metrics:
  ari:
    _target_: routed.ocl.metrics.ARIMetric
    prediction_path: decoder.masks
    target_path: input.mask
  mot:
    _target_: routed.ocl.metrics.MOTMetric
    prediction_path: decoder.masks
    target_path: input.mask
# Optimizer and learning-rate schedule.
optimizers:
  opt0:
    _target_: ocl.optimization.OptimizationWrapper
    # `_partial_: true` defers construction: the remaining arguments are
    # supplied later by whoever calls the partial.
    optimizer:
      _target_: torch.optim.Adam
      _partial_: true
      lr: 0.0002
    lr_scheduler:
      _target_: ocl.scheduling.cosine_annealing_with_optional_warmup
      _partial_: true
      # NOTE(review): `T_max` is 200000 while `trainer.max_steps` is 199999 —
      # presumably an intentional off-by-one; confirm before changing either.
      T_max: 200000
      eta_min: 0.0
      warmup_steps: 2500