Upload to Main

This commit is contained in:
张菲
2025-10-07 22:42:55 +08:00
commit d3ddab7c5d
218 changed files with 125815 additions and 0 deletions

View File

@@ -0,0 +1,35 @@
# Experiment config: ViT-B/16 backbone with a TRAINER.COCOOP section
# (N_CTX 16, empty CTX_INIT, fp16).
# Nested keys are indented under their section so the two BATCH_SIZE
# entries and the two NAME entries do not collide at the top level.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 1
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the per-channel mean/std consumed by the "normalize"
  # transform listed below -- confirm against the data pipeline.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 10
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 20
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"
TRAINER:
  COCOOP:
    # N_CTX is presumably the number of learnable context tokens -- TODO confirm.
    N_CTX: 16
    CTX_INIT: ""
    PREC: "fp16"

View File

@@ -0,0 +1,35 @@
# Experiment config: ViT-B/16 backbone with a TRAINER.COCOOP section
# (N_CTX 4, empty CTX_INIT, fp16).
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 1
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 10
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 20
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"
TRAINER:
  COCOOP:
    # Presumably the number of learnable context tokens -- TODO confirm.
    N_CTX: 4
    CTX_INIT: ""
    PREC: "fp16"

View File

@@ -0,0 +1,35 @@
# Experiment config: ViT-B/16 backbone with a TRAINER.COCOOP section
# (N_CTX 4, CTX_INIT "a photo of a", fp16).
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 1
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 10
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 20
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"
TRAINER:
  COCOOP:
    N_CTX: 4
    # The init text has four words, matching N_CTX above; whether the
    # trainer requires that match is not visible here -- TODO confirm.
    CTX_INIT: "a photo of a"
    PREC: "fp16"

View File

@@ -0,0 +1,35 @@
# Experiment config: ViT-B/16 backbone with a TRAINER.COCOOP section
# (N_CTX 8, empty CTX_INIT, fp16).
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 1
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 10
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 20
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"
TRAINER:
  COCOOP:
    # Presumably the number of learnable context tokens -- TODO confirm.
    N_CTX: 8
    CTX_INIT: ""
    PREC: "fp16"

View File

@@ -0,0 +1,29 @@
# Experiment config: RN101 backbone, SGD for 200 epochs with a cosine
# schedule. No TRAINER section is set in this file.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 200
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 5
MODEL:
  BACKBONE:
    NAME: "RN101"

View File

@@ -0,0 +1,29 @@
# Experiment config: RN101 backbone, SGD for 50 epochs with a cosine
# schedule. No TRAINER section is set in this file.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 50
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 5
MODEL:
  BACKBONE:
    NAME: "RN101"

View File

@@ -0,0 +1,29 @@
# Experiment config: RN50 backbone, SGD for 200 epochs with a cosine
# schedule. No TRAINER section is set in this file.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 200
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 5
MODEL:
  BACKBONE:
    NAME: "RN50"

View File

@@ -0,0 +1,33 @@
# Experiment config: RN50 backbone, SGD for 200 epochs with a cosine
# schedule, plus a TRAINER.COOP section that sets CTX_INIT.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 200
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 5
MODEL:
  BACKBONE:
    NAME: "RN50"
TRAINER:
  COOP:
    # Presumably the text used to initialise the prompt context -- TODO confirm.
    CTX_INIT: "a photo of a"

View File

@@ -0,0 +1,29 @@
# Experiment config: RN50 backbone, SGD for 100 epochs with a cosine
# schedule. No TRAINER section is set in this file.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 100
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 5
MODEL:
  BACKBONE:
    NAME: "RN50"

View File

@@ -0,0 +1,29 @@
# Experiment config: RN50 backbone, SGD for 50 epochs with a cosine
# schedule. No TRAINER section is set in this file.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 50
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 5
MODEL:
  BACKBONE:
    NAME: "RN50"

View File

@@ -0,0 +1,33 @@
# Experiment config: RN50 backbone, SGD for 50 epochs with a cosine
# schedule, plus a TRAINER.COOP section that sets CTX_INIT.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 50
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 5
MODEL:
  BACKBONE:
    NAME: "RN50"
TRAINER:
  COOP:
    # Presumably the text used to initialise the prompt context -- TODO confirm.
    CTX_INIT: "a photo of a"

View File

@@ -0,0 +1,17 @@
# Config with an RN50 backbone that sets only data loading, input
# preprocessing and the backbone name. It has no OPTIM, TRAIN or TRAINER
# section, so those values must come from elsewhere (defaults or CLI).
# Train and test batch sizes are both 200.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 200
  TEST:
    BATCH_SIZE: 200
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
MODEL:
  BACKBONE:
    NAME: "RN50"

View File

@@ -0,0 +1,29 @@
# Experiment config: ViT-B/16 backbone, SGD for 200 epochs with a cosine
# schedule. No TRAINER section is set in this file.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 200
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 5
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

View File

@@ -0,0 +1,29 @@
# Experiment config: ViT-B/16 backbone, SGD for 100 epochs with a cosine
# schedule. No TRAINER section is set in this file.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 100
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 5
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

View File

@@ -0,0 +1,29 @@
# Experiment config: ViT-B/16 backbone, SGD for 50 epochs with a cosine
# schedule. No TRAINER section is set in this file.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 50
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 5
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

View File

@@ -0,0 +1,29 @@
# Experiment config: ViT-B/32 backbone, SGD for 200 epochs with a cosine
# schedule. No TRAINER section is set in this file.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 200
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 5
MODEL:
  BACKBONE:
    NAME: "ViT-B/32"

View File

@@ -0,0 +1,29 @@
# Experiment config: ViT-B/32 backbone, SGD for 50 epochs with a cosine
# schedule. No TRAINER section is set in this file.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 50
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 5
MODEL:
  BACKBONE:
    NAME: "ViT-B/32"

View File

@@ -0,0 +1,39 @@
# Deep independent V-L Prompting
# ViT-B/16 backbone; the TRAINER.IVLP section sets both vision and text
# prompts (2 context entries each, depth 12 each).
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.0035
  MAX_EPOCH: 5
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 20
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"
TRAINER:
  IVLP:
    N_CTX_VISION: 2
    N_CTX_TEXT: 2
    # NOTE(review): the init text has four words while N_CTX_TEXT is 2;
    # how the trainer reconciles the two is not visible here -- confirm.
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    # Presumably the number of layers that receive prompts in each branch -- TODO confirm.
    PROMPT_DEPTH_VISION: 12
    PROMPT_DEPTH_TEXT: 12

View File

@@ -0,0 +1,39 @@
# Deep language prompting
# ViT-B/16 backbone; the TRAINER.IVLP section zeroes the vision-side
# settings (N_CTX_VISION 0, PROMPT_DEPTH_VISION 0) and keeps the text
# side at N_CTX_TEXT 4, PROMPT_DEPTH_TEXT 12.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.0025
  MAX_EPOCH: 5
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 20
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"
TRAINER:
  IVLP:
    # Vision-side prompting is presumably disabled by the zero values -- TODO confirm.
    N_CTX_VISION: 0
    N_CTX_TEXT: 4
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH_VISION: 0
    PROMPT_DEPTH_TEXT: 12

View File

@@ -0,0 +1,53 @@
# Experiment config: ViT-B/16 backbone with a TRAINER.MAPLEG section
# (N_CTX 4, PROMPT_DEPTH 9, fp16). Uses SUBSAMPLE_CLASSES "all", saves a
# checkpoint setting of 1 and selects the "best_val" model at test time.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATASET:
  # SELECTION_BATCH_SIZE is not explained in this file -- TODO document its consumer.
  SELECTION_BATCH_SIZE: 50
  SUBSAMPLE_CLASSES: all
DATALOADER:
  RETURN_IMG0: true
  TRAIN_X:
    BATCH_SIZE: 16
  TEST:
    BATCH_SIZE: 64
  NUM_WORKERS: 2
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
  # Disabled cutout settings, kept for reference:
  # CUTOUT_N: 1
  # CUTOUT_LEN: 128
OPTIM:
  NAME: "sgd"
  # Author's note lists two candidate rates: 0.0035, and 0.0026 "for crossdata"
  # (presumably cross-dataset runs). This file uses 0.0026.
  LR: 0.0026
  MAX_EPOCH: 5
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 20
  CHECKPOINT_FREQ: 1
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"
TEST:
  PER_CLASS_RESULT: false
  FINAL_MODEL: "best_val"
TRAINER:
  MAPLEG:
    N_CTX: 4
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH: 9

View File

@@ -0,0 +1,52 @@
# Experiment config: ViT-B/16 backbone with a TRAINER.MAPLE section
# (N_CTX 2, PROMPT_DEPTH 9, fp16). Uses SUBSAMPLE_CLASSES "base", a
# training batch size of 1 and the "best_val" model at test time.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATASET:
  # SELECTION_BATCH_SIZE is not explained in this file -- TODO document its consumer.
  SELECTION_BATCH_SIZE: 50
  SUBSAMPLE_CLASSES: base
DATALOADER:
  RETURN_IMG0: true
  TRAIN_X:
    BATCH_SIZE: 1
  TEST:
    BATCH_SIZE: 256
  NUM_WORKERS: 4
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
  # Disabled cutout settings, kept for reference:
  # CUTOUT_N: 1
  # CUTOUT_LEN: 128
OPTIM:
  NAME: "sgd"
  LR: 0.0035
  MAX_EPOCH: 5
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 20
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"
TEST:
  PER_CLASS_RESULT: false
  FINAL_MODEL: "best_val"
TRAINER:
  MAPLE:
    # NOTE(review): the init text has four words while N_CTX is 2;
    # how the trainer reconciles the two is not visible here -- confirm.
    N_CTX: 2
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH: 9

View File

@@ -0,0 +1,41 @@
# Experiment config: ViT-B/16 backbone with a TRAINER.MAPLE section
# (N_CTX 2, PROMPT_DEPTH 9, fp16). Trains for 5 epochs at LR 0.0035 and
# selects the "best_val" model at test time.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 4
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.0035
  MAX_EPOCH: 5
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 20
# Top-level TEST section (distinct from DATALOADER.TEST above).
TEST:
  FINAL_MODEL: "best_val"
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"
TRAINER:
  MAPLE:
    N_CTX: 2
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH: 9

View File

@@ -0,0 +1,36 @@
# Experiment config: ViT-B/16 backbone with a TRAINER.MAPLE section
# (N_CTX 2, PROMPT_DEPTH 9, fp16). Short run: 2 epochs at LR 0.0026.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.0026
  # With WARMUP_EPOCH 1 below, one of the two epochs is a warm-up epoch.
  MAX_EPOCH: 2
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 20
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"
TRAINER:
  MAPLE:
    N_CTX: 2
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH: 9

View File

@@ -0,0 +1,53 @@
# Experiment config: ViT-B/16 backbone with a TRAINER.MAPLE section
# (N_CTX 2, PROMPT_DEPTH 9, fp16). Uses SUBSAMPLE_CLASSES "all", trains
# for 10 epochs and selects the "best_val" model at test time.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATASET:
  # SELECTION_BATCH_SIZE is not explained in this file -- TODO document its consumer.
  SELECTION_BATCH_SIZE: 50
  SUBSAMPLE_CLASSES: all
DATALOADER:
  RETURN_IMG0: true
  TRAIN_X:
    BATCH_SIZE: 4
  TEST:
    BATCH_SIZE: 128
  NUM_WORKERS: 4
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
  # Disabled cutout settings, kept for reference:
  # CUTOUT_N: 1
  # CUTOUT_LEN: 128
OPTIM:
  NAME: "sgd"
  # Author's note lists two candidate rates: 0.0035, and 0.0026 "for crossdata"
  # (presumably cross-dataset runs). This file uses 0.0035.
  LR: 0.0035
  MAX_EPOCH: 10
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 20
  CHECKPOINT_FREQ: 1
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"
TEST:
  PER_CLASS_RESULT: false
  FINAL_MODEL: "best_val"
TRAINER:
  MAPLE:
    N_CTX: 2
    # NOTE(review): starts with a capital "A"; confirm this is intentional
    # and whether the tokenizer in use is case-sensitive.
    CTX_INIT: "A photo of a"
    PREC: "fp16"
    PROMPT_DEPTH: 9

View File

@@ -0,0 +1,37 @@
# Deep vision prompting
# ViT-B/16 backbone; the TRAINER.VPT section sets N_CTX_VISION 8 and
# PROMPT_DEPTH_VISION 12 at fp16.
# Section members are indented so each BATCH_SIZE / NAME key stays
# scoped to its own parent.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8
INPUT:
  SIZE: (224, 224)
  INTERPOLATION: "bicubic"
  # Presumably the normalization constants for the "normalize" transform -- confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
OPTIM:
  NAME: "sgd"
  LR: 0.0025
  MAX_EPOCH: 5
  LR_SCHEDULER: "cosine"
  # Warm-up settings: 1 epoch, type "constant", LR 1e-5.
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5
TRAIN:
  PRINT_FREQ: 20
MODEL:
  BACKBONE:
    NAME: "ViT-B/16"
TRAINER:
  VPT:
    N_CTX_VISION: 8
    # NOTE(review): a text init string in a vision-only prompting section;
    # presumably used for the text-side prompt -- confirm against the trainer.
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH_VISION: 12