Upload to Main

This commit is contained in:
张菲
2025-10-07 22:42:55 +08:00
commit d3ddab7c5d
218 changed files with 125815 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
from .euclidean import *
from .cossim import *
from .submodular_function import *
from .submodular_optimizer import *

View File

@@ -0,0 +1,35 @@
import numpy as np
import torch
def cossim_np(v1, v2):
    """Pairwise cosine similarity between rows of v1 and rows of v2 (NumPy).

    Returns an (len(v1), len(v2)) matrix rescaled from [-1, 1] to [0, 1].
    """
    inner = v1 @ v2.T
    scale = np.linalg.norm(v1, axis=1)[:, None] * np.linalg.norm(v2, axis=1)
    # Small epsilon keeps zero-norm rows from dividing by zero.
    sim = inner / (scale + 1e-6)
    # Guard against -inf entries (kept from the original formulation).
    sim[np.isneginf(sim)] = 0.
    return 0.5 + 0.5 * sim
def cossim_pair_np(v1):
    """All-pairs cosine similarity among rows of v1 (NumPy), mapped to [0, 1]."""
    row_norms = np.linalg.norm(v1, axis=1)
    # outer(row_norms, row_norms) is the denominator grid; epsilon avoids 0-division.
    sim = (v1 @ v1.T) / (np.outer(row_norms, row_norms) + 1e-6)
    # Guard against -inf entries (kept from the original formulation).
    sim[np.isneginf(sim)] = 0.
    return 0.5 + 0.5 * sim
def cossim(v1, v2):
    """Pairwise cosine similarity between rows of v1 and v2 (torch), in [0, 1]."""
    inner = torch.matmul(v1, v2.T)
    scale = torch.outer(torch.norm(v1, dim=1), torch.norm(v2, dim=1))
    # Epsilon protects zero-norm rows from dividing by zero.
    sim = inner / (scale + 1e-6)
    # Guard against -inf entries (kept from the original formulation).
    sim[torch.isneginf(sim)] = 0.
    return 0.5 + 0.5 * sim
def cossim_pair(v1):
    """All-pairs cosine similarity among rows of v1 (torch), mapped to [0, 1]."""
    row_norms = torch.norm(v1, dim=1)
    # Denominator grid of norm products; epsilon avoids division by zero.
    sim = torch.matmul(v1, v1.T) / (torch.outer(row_norms, row_norms) + 1e-6)
    # Guard against -inf entries (kept from the original formulation).
    sim[torch.isneginf(sim)] = 0.
    return 0.5 + 0.5 * sim

View File

@@ -0,0 +1,36 @@
import torch
import numpy as np
def euclidean_dist(x, y):
    """Pairwise Euclidean distances between rows of x and rows of y (torch).

    Args:
        x: float tensor of shape (m, d).
        y: float tensor of shape (n, d).
    Returns:
        (m, n) tensor where entry (i, j) is ||x_i - y_j||_2.
    """
    m, n = x.size(0), y.size(0)
    xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, n)
    yy = torch.pow(y, 2).sum(1, keepdim=True).expand(n, m).t()
    dist = xx + yy
    # ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a.b.  The positional (beta, alpha)
    # addmm_ overload is removed in current PyTorch, so pass them by keyword.
    dist.addmm_(x, y.t(), beta=1, alpha=-2)
    # Clamp avoids sqrt of tiny negatives caused by floating-point cancellation.
    dist = dist.clamp(min=1e-12).sqrt()
    return dist
def euclidean_dist_pair(x):
    """All-pairs Euclidean distance matrix for rows of x (torch).

    Args:
        x: float tensor of shape (m, d).
    Returns:
        (m, m) symmetric tensor of distances; diagonal is ~sqrt(1e-12) = 1e-6
        (exact zeros are clamped before the sqrt).
    """
    m = x.size(0)
    xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, m)
    dist = xx + xx.t()
    # ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a.b.  The positional (beta, alpha)
    # addmm_ overload is removed in current PyTorch, so pass them by keyword.
    dist.addmm_(x, x.t(), beta=1, alpha=-2)
    # Clamp avoids sqrt of tiny negatives caused by floating-point cancellation.
    dist = dist.clamp(min=1e-12).sqrt()
    return dist
def euclidean_dist_np(x, y):
    """Pairwise Euclidean distances between rows of x (m, d) and y (n, d), NumPy.

    Returns an (m, n) matrix; squared distances are clipped at 1e-12 before the
    square root to avoid NaNs from floating-point cancellation.
    """
    cross = np.dot(x, y.T)
    # Broadcasting the squared row norms replaces explicit np.repeat tiling.
    sq_x = np.sum(x * x, axis=1)[:, np.newaxis]
    sq_y = np.sum(y * y, axis=1)[np.newaxis, :]
    return np.sqrt(np.clip(sq_x + sq_y - 2. * cross, 1e-12, None))
# Calculate the Euclidean distance of each sample in x; returns an N*N matrix
# whose diagonal is zero (up to the sqrt(1e-12) clipping floor).
def euclidean_dist_pair_np(x):
    """All-pairs Euclidean distance matrix for rows of x (NumPy)."""
    cross = np.dot(x, x.T)
    # Broadcasting the squared row norms replaces explicit np.repeat tiling.
    sq = np.sum(x * x, axis=1)[:, np.newaxis]
    return np.sqrt(np.clip(sq + sq.T - 2. * cross, 1e-12, None))

View File

@@ -0,0 +1,144 @@
import numpy as np
class SubmodularFunction(object):
    """Base class for submodular set functions over a sample index set.

    The sample similarity can be supplied in one of two ways:
      * ``similarity_matrix`` -- a precomputed (n, n) pairwise similarity
        matrix for the whole sample set; or
      * ``similarity_kernel`` -- a callable ``kernel(rows, cols)`` used to
        calculate similarities incrementally at a later time if required.

    Args:
        index: array-like of sample indices the function is defined over.
        similarity_kernel: optional callable computing similarities on demand.
        similarity_matrix: optional precomputed (n, n) similarity matrix.
        already_selected: indices selected before optimization starts
            (defaults to an empty list).
    """
    def __init__(self, index, similarity_kernel=None, similarity_matrix=None, already_selected=None):
        self.index = index
        self.n = len(index)
        # A fresh list per instance: a mutable default argument ([]) would be
        # shared by every instance of this class and all its subclasses.
        self.already_selected = [] if already_selected is None else already_selected
        assert similarity_kernel is not None or similarity_matrix is not None
        if similarity_kernel is not None:
            assert callable(similarity_kernel)
            # Let subclasses wrap the kernel (e.g. to add column caching).
            self.similarity_kernel = self._similarity_kernel(similarity_kernel)
        else:
            assert similarity_matrix.shape[0] == self.n and similarity_matrix.shape[1] == self.n
            self.similarity_matrix = similarity_matrix
            # Matrix mode: the kernel is a direct (rows, cols) lookup.
            self.similarity_kernel = lambda a, b: self.similarity_matrix[np.ix_(a, b)]

    def _similarity_kernel(self, similarity_kernel):
        # Default behavior: use the user kernel as-is; subclasses override.
        return similarity_kernel
class FacilityLocation(SubmodularFunction):
    """Facility-location function: f(S) = sum_i max_{j in S} sim(i, j).

    ``cur_max`` tracks, for every sample, the best similarity to anything
    selected so far, so marginal gains are cheap to evaluate.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        if len(self.already_selected) == 0:
            # Nothing selected yet: best similarity so far is zero everywhere.
            self.cur_max = np.zeros(self.n, dtype=np.float32)
        else:
            self.cur_max = np.max(self.similarity_kernel(np.arange(self.n), self.already_selected), axis=1)
        # Boolean mask selecting every sample; reused for full-row queries.
        self.all_idx = np.ones(self.n, dtype=bool)

    def _similarity_kernel(self, similarity_kernel):
        # Cache computed similarity columns so each is evaluated at most once.
        self.sim_matrix = np.zeros([self.n, self.n], dtype=np.float32)
        self.if_columns_calculated = np.zeros(self.n, dtype=bool)

        def _func(a, b):
            if not np.all(self.if_columns_calculated[b]):
                if b.dtype != bool:
                    # Convert integer indices to a boolean column mask.
                    temp = ~self.all_idx
                    temp[b] = True
                    b = temp
                not_calculated = b & ~self.if_columns_calculated
                self.sim_matrix[:, not_calculated] = similarity_kernel(self.all_idx, not_calculated)
                self.if_columns_calculated[not_calculated] = True
            return self.sim_matrix[np.ix_(a, b)]
        return _func

    def calc_gain(self, idx_gain, selected, **kwargs):
        """Marginal gain of adding each candidate in ``idx_gain``."""
        gains = np.maximum(0., self.similarity_kernel(self.all_idx, idx_gain) - self.cur_max.reshape(-1, 1)).sum(axis=0)
        return gains

    def calc_gain_batch(self, idx_gain, selected, **kwargs):
        """Same as calc_gain, accumulating over row batches of size kwargs["batch"]."""
        batch_idx = ~self.all_idx
        batch_idx[0:kwargs["batch"]] = True
        gains = np.maximum(0., self.similarity_kernel(batch_idx, idx_gain) - self.cur_max[batch_idx].reshape(-1, 1)).sum(axis=0)
        for i in range(kwargs["batch"], self.n, kwargs["batch"]):
            batch_idx = ~self.all_idx
            # BUG FIX: `i` already advances in steps of `batch`, so the slice is
            # [i, i + batch); the original multiplied by `batch` again and
            # skipped every row batch after the first.
            batch_idx[i:i + kwargs["batch"]] = True
            gains += np.maximum(0., self.similarity_kernel(batch_idx, idx_gain) - self.cur_max[batch_idx].reshape(-1, 1)).sum(axis=0)
        return gains

    def update_state(self, new_selection, total_selected, **kwargs):
        """Fold newly selected samples into the per-sample running maximum."""
        self.cur_max = np.maximum(self.cur_max, np.max(self.similarity_kernel(self.all_idx, new_selection), axis=1))
class GraphCut(SubmodularFunction):
    """Graph-cut submodular function.

    ``lam`` trades off total similarity mass of a candidate (its column sum)
    against redundancy with the already selected set (see calc_gain).
    """
    def __init__(self, lam: float = 1., **kwargs):
        super().__init__(**kwargs)
        self.lam = lam
        # In matrix mode the column sums can be precomputed once; in kernel
        # mode they are filled incrementally by _similarity_kernel below.
        if 'similarity_matrix' in kwargs:
            self.sim_matrix_cols_sum = np.sum(self.similarity_matrix, axis=0)
        # Boolean mask selecting every sample; reused for full-row queries.
        self.all_idx = np.ones(self.n, dtype=bool)
    def _similarity_kernel(self, similarity_kernel):
        # Initialize a matrix to store similarity values of sample points.
        self.sim_matrix = np.zeros([self.n, self.n], dtype=np.float32)
        self.sim_matrix_cols_sum = np.zeros(self.n, dtype=np.float32)
        # Tracks which columns of sim_matrix have been filled in.
        self.if_columns_calculated = np.zeros(self.n, dtype=bool)
        def _func(a, b):
            # Lazily compute any requested columns that are still missing.
            if not np.all(self.if_columns_calculated[b]):
                if b.dtype != bool:
                    # Convert integer indices to a boolean column mask.
                    temp = ~self.all_idx
                    temp[b] = True
                    b = temp
                not_calculated = b & ~self.if_columns_calculated
                self.sim_matrix[:, not_calculated] = similarity_kernel(self.all_idx, not_calculated)
                # Column sums must be updated right after the columns are filled.
                self.sim_matrix_cols_sum[not_calculated] = np.sum(self.sim_matrix[:, not_calculated], axis=0)
                self.if_columns_calculated[not_calculated] = True
            return self.sim_matrix[np.ix_(a, b)]
        return _func
    def calc_gain(self, idx_gain, selected, **kwargs):
        # Conditional gain: lam * (total similarity mass of the candidate)
        # minus twice its similarity to the already selected set.
        # (selected, idx_gain) is passed so the kernel closure _func() can
        # lazily materialize the needed columns.
        gain = -2. * np.sum(self.similarity_kernel(selected, idx_gain), axis=0) + self.lam * self.sim_matrix_cols_sum[idx_gain]
        return gain
    def update_state(self, new_selection, total_selected, **kwargs):
        # Graph-cut gains need no incremental state beyond the cached columns.
        pass
class LogDeterminant(SubmodularFunction):
    """Log-determinant submodular function over the sample similarity matrix."""
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Boolean mask selecting every sample; reused for full-row queries.
        self.all_idx = np.ones(self.n, dtype=bool)
    def _similarity_kernel(self, similarity_kernel):
        # Initialize a matrix to store similarity values of sample points.
        self.sim_matrix = np.zeros([self.n, self.n], dtype=np.float32)
        # Tracks which columns of sim_matrix have been filled in.
        self.if_columns_calculated = np.zeros(self.n, dtype=bool)
        def _func(a, b):
            # Lazily compute any requested columns that are still missing.
            if not np.all(self.if_columns_calculated[b]):
                if b.dtype != bool:
                    # Convert integer indices to a boolean column mask.
                    temp = ~self.all_idx
                    temp[b] = True
                    b = temp
                not_calculated = b & ~self.if_columns_calculated
                self.sim_matrix[:, not_calculated] = similarity_kernel(self.all_idx, not_calculated)
                self.if_columns_calculated[not_calculated] = True
            return self.sim_matrix[np.ix_(a, b)]
        return _func
    def calc_gain(self, idx_gain, selected, **kwargs):
        # Gain for LogDeterminant can be written as $f(x | A ) = \log\det(S_{a} - S_{a,A}S_{A}^{-1}S_{x,A}^T)$.
        # NOTE(review): the expression below returns only the quadratic form
        # S_{x,A} S_A^{-1} S_{x,A}^T -- no log and no diagonal S_x term from
        # the formula above. Confirm this is the intended ranking surrogate.
        sim_idx_gain = self.similarity_kernel(selected, idx_gain).T
        sim_selected = self.similarity_kernel(selected, selected)
        # pinv (not inv) tolerates a singular selected-set similarity matrix.
        return (np.dot(sim_idx_gain, np.linalg.pinv(sim_selected)) * sim_idx_gain).sum(-1)
    def update_state(self, new_selection, total_selected, **kwargs):
        # No incremental state is needed beyond the cached similarity columns.
        pass

View File

@@ -0,0 +1,155 @@
import numpy as np
from tqdm import tqdm
# Names of the greedy optimizer classes defined in this module.
optimizer_choices = ["NaiveGreedy", "LazyGreedy", "StochasticGreedy", "ApproximateLazyGreedy"]
class optimizer(object):
    """Base class for greedy submodular-maximization optimizers.

    Args:
        args: configuration object; subclasses read args.TRAIN.PRINT_FREQ.
        index: array-like of sample indices selection is performed over.
        budget: number of samples to select; must satisfy 0 < budget <= len(index).
        already_selected: positions (into ``index``) selected before
            optimization starts (defaults to an empty list).

    Raises:
        ValueError: if ``budget`` is not a legal selection size.
    """
    def __init__(self, args, index, budget: int, already_selected=None):
        self.args = args
        self.index = index
        if budget <= 0 or budget > len(index):
            raise ValueError("Illegal budget for optimizer.")
        self.n = len(index)
        self.budget = budget
        # Fresh list per instance: a mutable default argument ([]) would be
        # shared by every optimizer instance.
        self.already_selected = [] if already_selected is None else already_selected
class NaiveGreedy(optimizer):
    """Plain greedy selection: every round recomputes the marginal gain of all
    unselected candidates and picks the argmax."""
    def __init__(self, args, index, budget: int, already_selected=None):
        # None (not []) as default avoids a shared mutable default argument;
        # normalize before handing off to the base class.
        super(NaiveGreedy, self).__init__(args, index, budget,
                                          [] if already_selected is None else already_selected)

    def select(self, gain_function, update_state=None, **kwargs):
        """Greedily pick ``budget`` elements; returns the selected entries of ``self.index``.

        Args:
            gain_function: callable(candidates, selected_mask, **kwargs) -> gains
                for each candidate (candidates given as a boolean mask here).
            update_state: optional callable(new_selection, selected_mask, **kwargs)
                notified after each pick.
        """
        assert callable(gain_function)
        if update_state is not None:
            assert callable(update_state)
        selected = np.zeros(self.n, dtype=bool)
        selected[self.already_selected] = True
        greedy_gain = np.zeros(len(self.index))
        for i in range(sum(selected), self.budget):
            if i % self.args.TRAIN.PRINT_FREQ == 0:
                print("| Selecting [%3d/%3d]" % (i + 1, self.budget))
            # Recompute exact gains for everything not yet selected.
            greedy_gain[~selected] = gain_function(~selected, selected, **kwargs)
            current_selection = greedy_gain.argmax()
            selected[current_selection] = True
            # -inf keeps an already-picked element from winning argmax again.
            greedy_gain[current_selection] = -np.inf
            if update_state is not None:
                update_state(np.array([current_selection]), selected, **kwargs)
        return self.index[selected]
class LazyGreedy(optimizer):
    """Lazy greedy (Minoux's accelerated greedy).

    Exploits submodularity: marginal gains only shrink as the selected set
    grows, so cached gains are valid upper bounds. Each round re-evaluates
    only the current top candidate; when it stays on top after re-evaluation
    it is selected without touching the rest.
    """
    def __init__(self, args, index, budget: int, already_selected=None):
        # None (not []) as default avoids a shared mutable default argument.
        super(LazyGreedy, self).__init__(args, index, budget,
                                         [] if already_selected is None else already_selected)

    def select(self, gain_function, update_state=None, **kwargs):
        """Greedily pick ``budget`` elements; returns the selected entries of ``self.index``."""
        assert callable(gain_function)
        if update_state is not None:
            assert callable(update_state)
        selected = np.zeros(self.n, dtype=bool)
        selected[self.already_selected] = True
        greedy_gain = np.zeros(len(self.index))
        # One exact pass to initialize the cached gains of all candidates.
        greedy_gain[~selected] = gain_function(~selected, selected, **kwargs)
        greedy_gain[selected] = -np.inf
        for i in tqdm(range(sum(selected), self.budget)):
            if i % self.args.TRAIN.PRINT_FREQ == 0:
                print("| Selecting [%3d/%3d]" % (i + 1, self.budget))
            best_gain = -np.inf
            last_max_element = -1
            while True:
                cur_max_element = greedy_gain.argmax()
                if last_max_element == cur_max_element:
                    # The freshly re-evaluated gain is still the maximum:
                    # select cur_max_element into the current subset.
                    selected[cur_max_element] = True
                    greedy_gain[cur_max_element] = -np.inf
                    if update_state is not None:
                        update_state(np.array([cur_max_element]), selected, **kwargs)
                    break
                # Refresh the stale cached gain of the current top candidate.
                new_gain = gain_function(np.array([cur_max_element]), selected, **kwargs)[0]
                greedy_gain[cur_max_element] = new_gain
                if new_gain >= best_gain:
                    best_gain = new_gain
                    last_max_element = cur_max_element
        return self.index[selected]
class StochasticGreedy(optimizer):
    """Stochastic greedy: each round evaluates gains only on a uniformly
    random subset of the unselected samples instead of all of them.

    Args:
        epsilon: approximation parameter; the per-round sample size is
            ``max(round(-log(epsilon) * n / budget), 1)``.
    """
    def __init__(self, args, index, budget: int, already_selected=None, epsilon: float = 0.9):
        # None (not []) as default avoids a shared mutable default argument.
        super(StochasticGreedy, self).__init__(args, index, budget,
                                               [] if already_selected is None else already_selected)
        self.epsilon = epsilon

    def select(self, gain_function, update_state=None, **kwargs):
        """Greedily pick ``budget`` elements; returns the selected entries of ``self.index``.

        NOTE: uses np.random without seeding, so results vary run to run
        unless the caller seeds the global RNG.
        """
        assert callable(gain_function)
        if update_state is not None:
            assert callable(update_state)
        selected = np.zeros(self.n, dtype=bool)
        selected[self.already_selected] = True
        sample_size = max(round(-np.log(self.epsilon) * self.n / self.budget), 1)
        greedy_gain = np.zeros(len(self.index))
        all_idx = np.arange(self.n)
        for i in range(sum(selected), self.budget):
            if i % self.args.TRAIN.PRINT_FREQ == 0:
                print("| Selecting [%3d/%3d]" % (i + 1, self.budget))
            # Uniformly select a subset from unselected samples with size
            # sample_size, capped at the number of remaining candidates.
            subset = np.random.choice(all_idx[~selected], replace=False, size=min(sample_size, self.n - i))
            if len(subset) == 0:
                break
            greedy_gain[subset] = gain_function(subset, selected, **kwargs)
            current_selection = greedy_gain[subset].argmax()
            selected[subset[current_selection]] = True
            greedy_gain[subset[current_selection]] = -np.inf
            if update_state is not None:
                update_state(np.array([subset[current_selection]]), selected, **kwargs)
        return self.index[selected]
class ApproximateLazyGreedy(optimizer):
    """Approximate lazy greedy: accept the current top cached candidate as soon
    as its re-evaluated gain is within a factor ``beta`` of its cached value,
    instead of requiring it to remain the exact maximum.

    Args:
        beta: acceptance factor in (0, 1]; beta=1 approaches exact lazy greedy.
    """
    def __init__(self, args, index, budget: int, already_selected=None, beta: float = 0.9):
        # None (not []) as default avoids a shared mutable default argument.
        super(ApproximateLazyGreedy, self).__init__(args, index, budget,
                                                    [] if already_selected is None else already_selected)
        self.beta = beta

    def select(self, gain_function, update_state=None, **kwargs):
        """Greedily pick ``budget`` elements; returns the selected entries of ``self.index``."""
        assert callable(gain_function)
        if update_state is not None:
            assert callable(update_state)
        selected = np.zeros(self.n, dtype=bool)
        selected[self.already_selected] = True
        greedy_gain = np.zeros(len(self.index))
        # One exact pass to initialize the cached gains of all candidates.
        greedy_gain[~selected] = gain_function(~selected, selected, **kwargs)
        greedy_gain[selected] = -np.inf
        for i in range(sum(selected), self.budget):
            if i % self.args.TRAIN.PRINT_FREQ == 0:
                print("| Selecting [%3d/%3d]" % (i + 1, self.budget))
            while True:
                cur_max_element = greedy_gain.argmax()
                max_gain = greedy_gain[cur_max_element]
                # Re-evaluate the cached top candidate against the current set.
                new_gain = gain_function(np.array([cur_max_element]), selected, **kwargs)[0]
                if new_gain >= self.beta * max_gain:
                    # Close enough to its cached bound: select cur_max_element
                    # into the current subset.
                    selected[cur_max_element] = True
                    greedy_gain[cur_max_element] = -np.inf
                    if update_state is not None:
                        update_state(np.array([cur_max_element]), selected, **kwargs)
                    break
                else:
                    # Too stale: store the refreshed gain and try again.
                    greedy_gain[cur_max_element] = new_gain
        return self.index[selected]