rename to dzgcoop

2026-02-24 20:35:56 +08:00
13 changed files with 102 additions and 79 deletions
@@ -1,4 +1,4 @@
-# PromptSRC: Prompting with Self-regularizing constraints
+# DZGCoOp: Dual-branch Zero-shot Guidance CoOp
 DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
@@ -30,13 +30,15 @@ MODEL:
    NAME: "ViT-B/16"

 TRAINER:
-  PROMPTSRC:
+  DZGCOOP:
    N_CTX_VISION: 4
    N_CTX_TEXT: 4
    CTX_INIT: "a photo of a"
    PREC: "fp16"
    PROMPT_DEPTH_VISION: 9
    PROMPT_DEPTH_TEXT: 9
-    TEXT_LOSS_WEIGHT: 25
    IMAGE_LOSS_WEIGHT: 10
-    LAST_K: 5
+    TEXT_LOSS_WEIGHT_STRONG: 10
+    TEXT_LOSS_WEIGHT_WEAK: 25
+    GPA_MEAN: 15
+    GPA_STD: 1
@@ -1,4 +1,4 @@
-# PromptSRC: Prompting with Self-regularizing constraints
+# DZGCoOp: Dual-branch Zero-shot Guidance CoOp
 DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
@@ -31,7 +31,7 @@ MODEL:
    NAME: "ViT-B/16"

 TRAINER:
-  PROMPTSRC:
+  DZGCOOP:
    N_CTX_VISION: 4
    N_CTX_TEXT: 4
    CTX_INIT: "a photo of a"
@@ -40,4 +40,5 @@ TRAINER:
    PROMPT_DEPTH_TEXT: 3
    TEXT_LOSS_WEIGHT: 25
    IMAGE_LOSS_WEIGHT: 10
-    LAST_K: 5
+    GPA_MEAN: 6
+    GPA_STD: 10
@@ -1,4 +1,4 @@
-# PromptSRC: Prompting with Self-regularizing constraints
+# DZGCoOp: Dual-branch Zero-shot Guidance CoOp
 DATALOADER:
  TRAIN_X:
    BATCH_SIZE: 4
@@ -30,7 +30,7 @@ MODEL:
    NAME: "ViT-B/16"

 TRAINER:
-  PROMPTSRC:
+  DZGCOOP:
    N_CTX_VISION: 4
    N_CTX_TEXT: 4
    CTX_INIT: "a photo of a"
@@ -39,4 +39,5 @@ TRAINER:
    PROMPT_DEPTH_TEXT: 3
    TEXT_LOSS_WEIGHT: 25
    IMAGE_LOSS_WEIGHT: 10
-    LAST_K: 5
+    GPA_MEAN: 6
+    GPA_STD: 10
@@ -11,7 +11,7 @@ Training PromptSRC on ImageNet for 20 epochs takes around 6 hours for a single s
 ## PromptSRC

 #### (1) Base-to-Novel class generalization setting
-The base-to-novel PromptSRC configuration is provided in config file at `configs/trainers/PromptSRC/vit_b16_c2_ep20_batch4_4+4ctx.yaml`. All hyper-parameters such as LAST_K, SCL loss weights coefficients, prompt length and prompt depth etc., can be modified using this config file.
+The base-to-novel PromptSRC configuration is provided in the config file at `configs/trainers/PromptSRC/vit_b16_c2_ep20_batch4_4+4ctx.yaml`. All hyper-parameters, such as GPA_STD, GPA_MEAN, the SCL loss weight coefficients, prompt length, and prompt depth, can be modified using this config file.

 Run the commands below to train PromptSRC on ImageNet.

@@ -109,7 +109,7 @@ def print_model_results(results, model_name):

 def main():
    root_dir = 'output'  # 修改为你的output目录路径
-    target_model = 'PromptSRC'  # 指定要分析的模型
+    target_model = 'DZGCoOp'  # 指定要分析的模型
    
    results = collect_model_results(root_dir, target_model)
    print_model_results(results, target_model)
@@ -9,14 +9,14 @@ datasets=(
    "caltech101"
    "fgvc_aircraft"
    "stanford_cars"
-    "sun397"
-    "imagenet"
+    # "sun397"
+    # "imagenet"
 )

 for dataset in "${datasets[@]}"; do
    for seed in "${seeds[@]}"; do
-        bash scripts/promptsrc/base2new_train.sh "$dataset" "$seed"
-        bash scripts/promptsrc/base2new_test.sh "$dataset" "$seed"
+        bash scripts/dzgcoop/base2new_train.sh "$dataset" "$seed"
+        bash scripts/dzgcoop/base2new_test.sh "$dataset" "$seed"
    done
 done

@@ -3,7 +3,7 @@

 # custom config
 DATA="~/Datasets/CoOp"
-TRAINER=PromptSRC
+TRAINER=DZGCoOp

 DATASET=$1
 SEED=$2
@@ -2,7 +2,7 @@

 # custom config
 DATA="~/Datasets/CoOp"
-TRAINER=PromptSRC
+TRAINER=DZGCoOp

 DATASET=$1
 SEED=$2
@@ -2,7 +2,7 @@


 DATA=" ~/Datasets/CoOp"
-TRAINER=PromptSRC
+TRAINER=DZGCoOp
 SRC_DATASETS=imagenet
 SHOTS=16
 CFG=vit_b16_c2_ep20_batch4_4+4ctx_cross_datasets
@@ -3,7 +3,7 @@

 # custom config
 DATA=" ~/Datasets/CoOp"
-TRAINER=PromptSRC
+TRAINER=DZGCoOp


 SRC_DATASETS=imagenet 
@@ -3,7 +3,7 @@

 # custom config
 DATA=" ~/Datasets/CoOp"
-TRAINER=PromptSRC
+TRAINER=DZGCoOp


 SRC_DATASETS=imagenet 
@@ -28,7 +28,7 @@ import trainers.cocoop
 import trainers.zsclip
 import trainers.maple
 import trainers.independentVL
-import trainers.promptsrc
+import trainers.dzgcoop


 def print_args(args, cfg):
@@ -110,19 +110,19 @@ def extend_cfg(cfg):
    cfg.TRAINER.MAPLE.PROMPT_DEPTH = 9  # Max 12, minimum 0, for 1 it will act as shallow MaPLe (J=1)
    cfg.DATASET.SUBSAMPLE_CLASSES = "all"  # all, base or new

-    # Config for PromptSRC
-    cfg.TRAINER.PROMPTSRC = CN()
-    cfg.TRAINER.PROMPTSRC.N_CTX_VISION = 4  # number of context vectors at the vision branch
-    cfg.TRAINER.PROMPTSRC.N_CTX_TEXT = 4  # number of context vectors at the language branch
-    cfg.TRAINER.PROMPTSRC.CTX_INIT = "a photo of a"  # initialization words
-    cfg.TRAINER.PROMPTSRC.PREC = "fp16"  # fp16, fp32, amp
-    cfg.TRAINER.PROMPTSRC.PROMPT_DEPTH_VISION = 9  # Max 12, minimum 0, for 0 it will be using shallow IVLP prompting (J=1)
-    cfg.TRAINER.PROMPTSRC.PROMPT_DEPTH_TEXT = 9  # Max 12, minimum 0, for 0 it will be using shallow IVLP prompting (J=1)
-    cfg.TRAINER.PROMPTSRC.TEXT_LOSS_WEIGHT = 25
-    cfg.TRAINER.PROMPTSRC.TEXT_LOSS_WEIGHT_STRONG = 25  # lambda2: strong text constraint weight
-    cfg.TRAINER.PROMPTSRC.TEXT_LOSS_WEIGHT_WEAK = 2.5  # lambda3: weak text constraint weight
-    cfg.TRAINER.PROMPTSRC.IMAGE_LOSS_WEIGHT = 10
-    cfg.TRAINER.PROMPTSRC.LAST_K = 5
+    # Config for DZGCoOp
+    cfg.TRAINER.DZGCOOP = CN()
+    cfg.TRAINER.DZGCOOP.N_CTX_VISION = 4  # number of context vectors at the vision branch
+    cfg.TRAINER.DZGCOOP.N_CTX_TEXT = 4  # number of context vectors at the language branch
+    cfg.TRAINER.DZGCOOP.CTX_INIT = "a photo of a"  # initialization words
+    cfg.TRAINER.DZGCOOP.PREC = "fp16"  # fp16, fp32, amp
+    cfg.TRAINER.DZGCOOP.PROMPT_DEPTH_VISION = 9  # Max 12, minimum 0, for 0 it will be using shallow IVLP prompting (J=1)
+    cfg.TRAINER.DZGCOOP.PROMPT_DEPTH_TEXT = 9  # Max 12, minimum 0, for 0 it will be using shallow IVLP prompting (J=1)
+    cfg.TRAINER.DZGCOOP.TEXT_LOSS_WEIGHT_STRONG = 25  # lambda2: strong text constraint weight
+    cfg.TRAINER.DZGCOOP.TEXT_LOSS_WEIGHT_WEAK = 10  # lambda3: weak text constraint weight
+    cfg.TRAINER.DZGCOOP.IMAGE_LOSS_WEIGHT = 10
+    cfg.TRAINER.DZGCOOP.GPA_MEAN = 15
+    cfg.TRAINER.DZGCOOP.GPA_STD = 1
    cfg.DATASET.SUBSAMPLE_CLASSES = "all"  # all, base or new

    # Config for independent Vision Language prompting (independent-vlp)
@@ -51,10 +51,10 @@ def load_clip_to_cpu(cfg, zero_shot_model=False):
        state_dict = torch.load(model_path, map_location="cpu")
    if not zero_shot_model:
        design_details = {"trainer": 'IVLP',
-                          "vision_depth": cfg.TRAINER.PROMPTSRC.PROMPT_DEPTH_VISION,
-                          "language_depth": cfg.TRAINER.PROMPTSRC.PROMPT_DEPTH_TEXT,
-                          "vision_ctx": cfg.TRAINER.PROMPTSRC.N_CTX_VISION,
-                          "language_ctx": cfg.TRAINER.PROMPTSRC.N_CTX_TEXT}
+                           "vision_depth": cfg.TRAINER.DZGCOOP.PROMPT_DEPTH_VISION,
+                           "language_depth": cfg.TRAINER.DZGCOOP.PROMPT_DEPTH_TEXT,
+                           "vision_ctx": cfg.TRAINER.DZGCOOP.N_CTX_VISION,
+                           "language_ctx": cfg.TRAINER.DZGCOOP.N_CTX_TEXT}
        model = clip.build_model(state_dict or model.state_dict(), design_details)
    else:
        # Return original CLIP model for generating frozen VL features
@@ -95,11 +95,11 @@ class VLPromptLearner(nn.Module):
        super().__init__()
        n_cls = len(classnames)
        # Make sure Language depth >= 1
-        assert cfg.TRAINER.PROMPTSRC.PROMPT_DEPTH_TEXT >= 1, "In Independent VL prompting, Language prompt depth should be >=1" \
+        assert cfg.TRAINER.DZGCOOP.PROMPT_DEPTH_TEXT >= 1, "In Independent VL prompting, Language prompt depth should be >=1" \
                                                        "\nPlease use VPT trainer if you want to learn only vision " \
                                                        "branch"
-        n_ctx = cfg.TRAINER.PROMPTSRC.N_CTX_TEXT
-        ctx_init = cfg.TRAINER.PROMPTSRC.CTX_INIT
+        n_ctx = cfg.TRAINER.DZGCOOP.N_CTX_TEXT
+        ctx_init = cfg.TRAINER.DZGCOOP.CTX_INIT
        dtype = clip_model.dtype
        ctx_dim = clip_model.ln_final.weight.shape[0]
        clip_imsize = clip_model.visual.input_resolution
@@ -126,7 +126,7 @@ class VLPromptLearner(nn.Module):
        print(f'Strong branch initial text context: "{prompt_prefix_strong}"')
        print(f'Weak branch initial text context: "{prompt_prefix_weak}"')
        print(f"Number of context words (tokens) for Language prompting: {n_ctx}")
-        print(f"Number of context words (tokens) for Vision prompting: {cfg.TRAINER.PROMPTSRC.N_CTX_VISION}")
+        print(f"Number of context words (tokens) for Vision prompting: {cfg.TRAINER.DZGCOOP.N_CTX_VISION}")
        self.ctx_strong = nn.Parameter(ctx_vectors_strong)
        self.ctx_weak = nn.Parameter(ctx_vectors_weak)

@@ -261,9 +261,9 @@ class CustomCLIP(nn.Module):


@TRAINER_REGISTRY.register()
-class PromptSRC(TrainerX):
+class DZGCoOp(TrainerX):
    def check_cfg(self, cfg):
-        assert cfg.TRAINER.PROMPTSRC.PREC in ["fp16", "fp32", "amp"]
+        assert cfg.TRAINER.DZGCOOP.PREC in ["fp16", "fp32", "amp"]

    def build_model(self):
        cfg = self.cfg
@@ -272,7 +272,7 @@ class PromptSRC(TrainerX):
        print(f"Loading CLIP (backbone: {cfg.MODEL.BACKBONE.NAME})")
        clip_model = load_clip_to_cpu(cfg)

-        if cfg.TRAINER.PROMPTSRC.PREC == "fp32" or cfg.TRAINER.PROMPTSRC.PREC == "amp":
+        if cfg.TRAINER.DZGCOOP.PREC == "fp32" or cfg.TRAINER.DZGCOOP.PREC == "amp":
            # CLIP's default precision is fp16
            clip_model.float()

@@ -311,15 +311,21 @@ class PromptSRC(TrainerX):
        # Cosine scheduler
        self.total_epochs = cfg.OPTIM.MAX_EPOCH
        self.step_counter = 1
-        self.max_k = cfg.TRAINER.PROMPTSRC.LAST_K
-        self.last_k_models = []
-        self.scaler = GradScaler() if cfg.TRAINER.PROMPTSRC.PREC == "amp" else None
+        N = cfg.OPTIM.MAX_EPOCH
+        mean = cfg.TRAINER.DZGCOOP.GPA_MEAN
+        stdev = cfg.TRAINER.DZGCOOP.GPA_STD
+        gauss = self.get_gauss(mean, stdev)
+        self.gauss = np.array([gauss(a) for a in range(1, N + 1)])
+        self.gauss = self.gauss / sum(self.gauss)
+        self.scaler = GradScaler() if cfg.TRAINER.DZGCOOP.PREC == "amp" else None
        # Note that multi-gpu training could be slow because CLIP's size is
        # big, which slows down the copy operation in DataParallel
        device_count = torch.cuda.device_count()
        if device_count > 1:
            print(f"Multiple GPUs detected (n_gpus={device_count}), use all of them!")
            self.model = nn.DataParallel(self.model)
+        # Running Gaussian-weighted sum of per-epoch state dicts (None until the first epoch ends)
+        self.previous_model_gpa = None

    def forward_backward(self, batch):
        image, label = self.parse_batch_train(batch)
@@ -328,7 +334,7 @@ class PromptSRC(TrainerX):
        optim = self.optim
        scaler = self.scaler

-        prec = self.cfg.TRAINER.PROMPTSRC.PREC
+        prec = self.cfg.TRAINER.DZGCOOP.PREC
        if prec == "amp":
            with autocast():
                loss = model(image, label)
@@ -340,23 +346,23 @@ class PromptSRC(TrainerX):
            loss_ce, text_features_strong, text_features_weak, fixed_embeddings, zs_image_embedd, image_ft, \
            zero_shot_logits, logits_strong, logits_weak, logits_final = model(image, label)

-            lambda1 = self.cfg.TRAINER.PROMPTSRC.IMAGE_LOSS_WEIGHT
-            lambda2 = self.cfg.TRAINER.PROMPTSRC.TEXT_LOSS_WEIGHT_STRONG
-            lambda3 = self.cfg.TRAINER.PROMPTSRC.TEXT_LOSS_WEIGHT_WEAK
+            lambda1 = self.cfg.TRAINER.DZGCOOP.IMAGE_LOSS_WEIGHT
+            lambda2 = self.cfg.TRAINER.DZGCOOP.TEXT_LOSS_WEIGHT_STRONG
+            lambda3 = self.cfg.TRAINER.DZGCOOP.TEXT_LOSS_WEIGHT_WEAK

-            loss_scl_image = F.l1_loss(image_ft, zs_image_embedd.cuda(), reduction='mean') * lambda1
-            loss_scl_text_strong = F.l1_loss(text_features_strong, fixed_embeddings.cuda(), reduction='mean') * lambda2
-            loss_scl_text_weak = F.l1_loss(text_features_weak, fixed_embeddings.cuda(), reduction='mean') * lambda3
+            L_zvg = F.l1_loss(image_ft, zs_image_embedd.cuda(), reduction='mean') * lambda1
+            L_sg_strong = F.l1_loss(text_features_strong, fixed_embeddings.cuda(), reduction='mean') * lambda2
+            L_sg_weak = F.l1_loss(text_features_weak, fixed_embeddings.cuda(), reduction='mean') * lambda3

-            L_SCL_logits = F.kl_div(
+            L_zpg = F.kl_div(
                F.log_softmax(logits_final / 1, dim=1),
                F.log_softmax(zero_shot_logits / 1, dim=1),
                reduction='sum',
                log_target=True
            ) * (1 * 1) / logits_final.numel()

-            L_SCL = (L_SCL_logits + loss_scl_text_strong + loss_scl_text_weak + loss_scl_image)
-            loss = (loss_ce + L_SCL)
+            L_zg = (L_zpg + L_sg_strong + L_sg_weak + L_zvg)
+            loss = (loss_ce + L_zg)
            optim.zero_grad()
            loss.backward()
            optim.step()
@@ -365,32 +371,45 @@ class PromptSRC(TrainerX):

        if (self.batch_idx + 1) == self.num_batches:
            self.update_lr()
+            # One epoch has completed: add this epoch's Gaussian-weighted parameters to the GPA sum
            self.step_counter = self.step_counter + 1
+            current_epoch_weight = self.gauss[self.step_counter - 2]
            current_model_weights = copy.deepcopy(model.state_dict())
-            for key in current_model_weights:
-                current_model_weights[key] = current_model_weights[key].cpu()
-            self.last_k_models.append(current_model_weights)
-            if len(self.last_k_models) > self.max_k:
-                self.last_k_models.pop(0)
-            torch.cuda.empty_cache()
+            weighted_state_dict = self.state_dict_weighting(current_model_weights, current_epoch_weight)
+            if self.previous_model_gpa is None:
+                self.previous_model_gpa = weighted_state_dict
+            else:
+                self.previous_model_gpa = self.state_dict_add(weighted_state_dict, self.previous_model_gpa)

        if self.step_counter == self.model.total_epochs + 1:
-            print(f"Using Last-K Averaging (K={len(self.last_k_models)}) model for final inference...")
-            averaged_state_dict = self._average_last_k_models()
-            for key in averaged_state_dict:
-                averaged_state_dict[key] = averaged_state_dict[key].cuda()
-            model.load_state_dict(averaged_state_dict)
-            self.model.load_state_dict(averaged_state_dict)
+            print("Using GPA model for final inference...")
+            model.load_state_dict(self.previous_model_gpa)
+            self.model.load_state_dict(self.previous_model_gpa)
        return loss_summary

-    def _average_last_k_models(self):
-        if not self.last_k_models:
-            return {}
-        averaged_dict = {}
-        for key in self.last_k_models[0]:
-            stacked = torch.stack([model_state[key] for model_state in self.last_k_models])
-            averaged_dict[key] = torch.mean(stacked, dim=0)
-        return averaged_dict
+    def state_dict_weighting(self, main_dict, weightage, prompt_only=False):
+        # Scale every entry of the state dict by `weightage`
+        updated_dict = copy.deepcopy(main_dict)
+        if not prompt_only:
+            for key in main_dict:
+                updated_dict[key] = main_dict[key] * weightage
+            return updated_dict
+        else:
+            return main_dict * weightage
+
+    def state_dict_add(self, dict1, dict2, prompt_only=False):
+        # Add the two state dicts key-wise; dict2 is updated in place and returned
+        if not prompt_only:
+            modified_dict = dict2
+            for key in dict1:
+                modified_dict[key] = (modified_dict[key] + dict1[key])
+            return modified_dict
+        else:
+            return dict1 + dict2
+
+    def get_gauss(self, mu, sigma):
+        gauss = lambda x: (1 / (sigma * np.sqrt(2 * np.pi))) * np.exp(-0.5 * ((x - mu) / sigma) ** 2)
+        return gauss

    def parse_batch_train(self, batch):
        input = batch["img"]