Upload to Main

This commit is contained in:
张菲
2025-10-07 22:42:55 +08:00
commit d3ddab7c5d
218 changed files with 125815 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "Caltech101"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "DescribableTextures"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "EuroSAT"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "FGVCAircraft"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "Food101"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "ImageNet"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "ImageNetA"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "ImageNetR"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "ImageNetSketch"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "ImageNetV2"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "OxfordFlowers"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "OxfordPets"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "VOC12"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "StanfordCars"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "SUN397"

View File

@@ -0,0 +1,2 @@
# Dataset selection: only the dataset name is set here.
DATASET:
  NAME: "UCF101"

View File

@@ -0,0 +1,100 @@
# Vision-language detection config: "GeneralizedVLRCNN" meta-architecture with
# a "SWINT-FPN-RETINANET" conv body, a "bert-base-uncased" language backbone
# and a "VLDYHEAD" RPN architecture.
# NOTE(review): nesting below is reconstructed from the key groups (FUSE_CONFIG
# and USE_CHECKPOINT under DYHEAD, CLIP_GRADIENTS under SOLVER) — confirm
# against the consumer's default config tree.
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "swin_tiny_patch4_window7_224.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256
    FREEZE_CONV_BODY_AT: -1

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
    MASK_SPECIAL: False

  RPN:
    USE_FPN: True
    # Parenthesised values are plain YAML scalars, not YAML sequences.
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9 # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0

    FUSE_CONFIG:
      EARLY_FUSE_ON: True
      TYPE: "MHA-B"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_FUSED_FEATURES_DOT_PRODUCT: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True

    USE_CHECKPOINT: True

TEST:
  DURING_TRAINING: False
  IMS_PER_BATCH: 64

# use for grounding model
DATASETS:
  TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", )
  TEST: ("coco_2014_val", )
  DISABLE_SHUFFLE: False
  ADD_DET_PROMPT: False
  RANDOM_SAMPLE_NEG: 85
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)

  SEPARATION_TOKENS: ". "

INPUT:
  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333

AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)

DATALOADER:
  SIZE_DIVISIBILITY: 32

SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.0001
  STEPS: (0.67, 0.89)
  MAX_EPOCH: 30
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  USE_AMP: True
  MODEL_EMA: 0.999
  FIND_UNUSED_PARAMETERS: False

  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0

View File

@@ -0,0 +1,35 @@
# COCOOP trainer config: ViT-B/16 backbone, N_CTX 16, empty CTX_INIT,
# SGD with cosine schedule for 10 epochs, training batch size 1.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 1
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 10
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 20

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

TRAINER:
  COCOOP:
    N_CTX: 16
    CTX_INIT: ""  # empty string: no initial context text is given
    PREC: "fp16"

View File

@@ -0,0 +1,35 @@
# COCOOP trainer config: ViT-B/16 backbone, N_CTX 4, empty CTX_INIT,
# SGD with cosine schedule for 10 epochs, training batch size 1.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 1
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 10
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 20

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

TRAINER:
  COCOOP:
    N_CTX: 4
    CTX_INIT: ""  # empty string: no initial context text is given
    PREC: "fp16"

View File

@@ -0,0 +1,35 @@
# COCOOP trainer config: ViT-B/16 backbone, N_CTX 4, CTX_INIT "a photo of a",
# SGD with cosine schedule for 10 epochs, training batch size 1.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 1
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 10
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 20

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

TRAINER:
  COCOOP:
    N_CTX: 4
    CTX_INIT: "a photo of a"
    PREC: "fp16"

View File

@@ -0,0 +1,35 @@
# COCOOP trainer config: ViT-B/16 backbone, N_CTX 8, empty CTX_INIT,
# SGD with cosine schedule for 10 epochs, training batch size 1.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 1
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 10
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 20

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

TRAINER:
  COCOOP:
    N_CTX: 8
    CTX_INIT: ""  # empty string: no initial context text is given
    PREC: "fp16"

View File

@@ -0,0 +1,29 @@
# Training config: RN101 backbone, SGD with cosine schedule for 200 epochs,
# training batch size 32.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 200
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 5

MODEL:
  BACKBONE:
    NAME: "RN101"

View File

@@ -0,0 +1,29 @@
# Training config: RN101 backbone, SGD with cosine schedule for 50 epochs,
# training batch size 32.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 50
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 5

MODEL:
  BACKBONE:
    NAME: "RN101"

View File

@@ -0,0 +1,29 @@
# Training config: RN50 backbone, SGD with cosine schedule for 200 epochs,
# training batch size 32.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 200
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 5

MODEL:
  BACKBONE:
    NAME: "RN50"

View File

@@ -0,0 +1,33 @@
# COOP trainer config: RN50 backbone, CTX_INIT "a photo of a",
# SGD with cosine schedule for 200 epochs, training batch size 32.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 200
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 5

MODEL:
  BACKBONE:
    NAME: "RN50"

TRAINER:
  COOP:
    CTX_INIT: "a photo of a"

View File

@@ -0,0 +1,29 @@
# Training config: RN50 backbone, SGD with cosine schedule for 100 epochs,
# training batch size 32.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 100
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 5

MODEL:
  BACKBONE:
    NAME: "RN50"

View File

@@ -0,0 +1,29 @@
# Training config: RN50 backbone, SGD with cosine schedule for 50 epochs,
# training batch size 32.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 50
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 5

MODEL:
  BACKBONE:
    NAME: "RN50"

View File

@@ -0,0 +1,33 @@
# COOP trainer config: RN50 backbone, CTX_INIT "a photo of a",
# SGD with cosine schedule for 50 epochs, training batch size 32.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 50
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 5

MODEL:
  BACKBONE:
    NAME: "RN50"

TRAINER:
  COOP:
    CTX_INIT: "a photo of a"

View File

@@ -0,0 +1,17 @@
# Config with data-loading, input and backbone settings only (RN50); it sets
# no OPTIM or TRAIN section. Train and test batch sizes are both 200.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 200
  TEST:
    BATCH_SIZE: 200
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

MODEL:
  BACKBONE:
    NAME: "RN50"

View File

@@ -0,0 +1,29 @@
# Training config: ViT-B/16 backbone, SGD with cosine schedule for 200 epochs,
# training batch size 32.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 200
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 5

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

View File

@@ -0,0 +1,29 @@
# Training config: ViT-B/16 backbone, SGD with cosine schedule for 100 epochs,
# training batch size 32.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 100
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 5

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

View File

@@ -0,0 +1,29 @@
# Training config: ViT-B/16 backbone, SGD with cosine schedule for 50 epochs,
# training batch size 32.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 50
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 5

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

View File

@@ -0,0 +1,29 @@
# Training config: ViT-B/32 backbone, SGD with cosine schedule for 200 epochs,
# training batch size 32.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 200
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 5

MODEL:
  BACKBONE:
    NAME: "ViT-B/32"

View File

@@ -0,0 +1,29 @@
# Training config: ViT-B/32 backbone, SGD with cosine schedule for 50 epochs,
# training batch size 32.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 32
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.002
  MAX_EPOCH: 50
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 5

MODEL:
  BACKBONE:
    NAME: "ViT-B/32"

View File

@@ -0,0 +1,39 @@
# Deep independent V-L Prompting
# IVLP trainer config: ViT-B/16 backbone, 2 vision and 2 text context entries,
# prompt depth 12 on both sides, SGD with cosine schedule for 5 epochs.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.0035
  MAX_EPOCH: 5
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 20

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

TRAINER:
  IVLP:
    N_CTX_VISION: 2
    N_CTX_TEXT: 2
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH_VISION: 12
    PROMPT_DEPTH_TEXT: 12

View File

@@ -0,0 +1,39 @@
# Deep language prompting
# IVLP trainer config: ViT-B/16 backbone, text-side values only (4 text context
# entries, text prompt depth 12), SGD with cosine schedule for 5 epochs.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.0025
  MAX_EPOCH: 5
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 20

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

TRAINER:
  IVLP:
    # Both vision-side values are 0 in this language-only variant.
    N_CTX_VISION: 0
    N_CTX_TEXT: 4
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH_VISION: 0
    PROMPT_DEPTH_TEXT: 12

View File

@@ -0,0 +1,53 @@
# MAPLEG trainer config: ViT-B/16 backbone, SUBSAMPLE_CLASSES "all",
# N_CTX 4, prompt depth 9, SGD with cosine schedule for 5 epochs.
DATASET:
  SELECTION_BATCH_SIZE: 50
  SUBSAMPLE_CLASSES: all

DATALOADER:
  RETURN_IMG0: true
  TRAIN_X:
    BATCH_SIZE: 16
  TEST:
    BATCH_SIZE: 64
  NUM_WORKERS: 2

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
  # CUTOUT_N: 1
  # CUTOUT_LEN: 128

OPTIM:
  NAME: "sgd"
  LR: 0.0026  # author's note: 0.0035, 0.0026 for crossdata
  MAX_EPOCH: 5
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 20
  CHECKPOINT_FREQ: 1

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

TEST:
  PER_CLASS_RESULT: false
  FINAL_MODEL: "best_val"

TRAINER:
  MAPLEG:
    N_CTX: 4
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH: 9

View File

@@ -0,0 +1,52 @@
# MAPLE trainer config: ViT-B/16 backbone, SUBSAMPLE_CLASSES "base",
# N_CTX 2, prompt depth 9, SGD with cosine schedule for 5 epochs,
# training batch size 1.
DATASET:
  SELECTION_BATCH_SIZE: 50
  SUBSAMPLE_CLASSES: base

DATALOADER:
  RETURN_IMG0: true
  TRAIN_X:
    BATCH_SIZE: 1
  TEST:
    BATCH_SIZE: 256
  NUM_WORKERS: 4

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
  # CUTOUT_N: 1
  # CUTOUT_LEN: 128

OPTIM:
  NAME: "sgd"
  LR: 0.0035
  MAX_EPOCH: 5
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 20

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

TEST:
  PER_CLASS_RESULT: false
  FINAL_MODEL: "best_val"

TRAINER:
  MAPLE:
    N_CTX: 2
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH: 9

View File

@@ -0,0 +1,41 @@
# MAPLE trainer config: ViT-B/16 backbone, N_CTX 2, prompt depth 9,
# SGD with cosine schedule for 5 epochs, final model "best_val".
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 4

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.0035
  MAX_EPOCH: 5
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 20

# NOTE(review): TEST is placed at top level, matching the other configs in
# this commit that set FINAL_MODEL — confirm it was not nested under TRAIN.
TEST:
  FINAL_MODEL: "best_val"

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

TRAINER:
  MAPLE:
    N_CTX: 2
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH: 9

View File

@@ -0,0 +1,36 @@
# MAPLE trainer config: ViT-B/16 backbone, N_CTX 2, prompt depth 9,
# SGD (LR 0.0026) with cosine schedule for 2 epochs.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.0026
  MAX_EPOCH: 2
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 20

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

TRAINER:
  MAPLE:
    N_CTX: 2
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH: 9

View File

@@ -0,0 +1,53 @@
# MAPLE trainer config: ViT-B/16 backbone, SUBSAMPLE_CLASSES "all",
# N_CTX 2, prompt depth 9, SGD with cosine schedule for 10 epochs.
DATASET:
  SELECTION_BATCH_SIZE: 50
  SUBSAMPLE_CLASSES: all

DATALOADER:
  RETURN_IMG0: true
  TRAIN_X:
    BATCH_SIZE: 4
  TEST:
    BATCH_SIZE: 128
  NUM_WORKERS: 4

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]
  # CUTOUT_N: 1
  # CUTOUT_LEN: 128

OPTIM:
  NAME: "sgd"
  LR: 0.0035  # author's note: 0.0035, 0.0026 for crossdata
  MAX_EPOCH: 10
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 20
  CHECKPOINT_FREQ: 1

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

TEST:
  PER_CLASS_RESULT: false
  FINAL_MODEL: "best_val"

TRAINER:
  MAPLE:
    N_CTX: 2
    # NOTE(review): capitalised "A", unlike the lowercase "a photo of a" used
    # by the other configs in this commit — confirm this is intended.
    CTX_INIT: "A photo of a"
    PREC: "fp16"
    PROMPT_DEPTH: 9

View File

@@ -0,0 +1,37 @@
# Deep vision prompting
# VPT trainer config: ViT-B/16 backbone, 8 vision context entries, vision
# prompt depth 12, SGD with cosine schedule for 5 epochs.
DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
  TEST:
    BATCH_SIZE: 100
  NUM_WORKERS: 8

INPUT:
  SIZE: (224, 224)  # parenthesised plain scalar, not a YAML sequence
  INTERPOLATION: "bicubic"
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  TRANSFORMS: ["random_resized_crop", "random_flip", "normalize"]

OPTIM:
  NAME: "sgd"
  LR: 0.0025
  MAX_EPOCH: 5
  LR_SCHEDULER: "cosine"
  WARMUP_EPOCH: 1
  WARMUP_TYPE: "constant"
  WARMUP_CONS_LR: 1e-5

TRAIN:
  PRINT_FREQ: 20

MODEL:
  BACKBONE:
    NAME: "ViT-B/16"

TRAINER:
  VPT:
    N_CTX_VISION: 8
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH_VISION: 12