Upload to Main

This commit is contained in:
张菲
2025-10-07 22:42:55 +08:00
commit d3ddab7c5d
218 changed files with 125815 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
from .euclidean import *
from .cossim import *
from .submodular_function import *
from .submodular_optimizer import *

View File

@@ -0,0 +1,35 @@
import numpy as np
import torch
def cossim_np(v1, v2):
    """Pairwise cosine similarity between rows of v1 and rows of v2 (NumPy).

    Returns an (len(v1), len(v2)) matrix rescaled from [-1, 1] to [0, 1].
    """
    inner = v1 @ v2.T
    scale = np.linalg.norm(v1, axis=1)[:, None] * np.linalg.norm(v2, axis=1)
    # Small epsilon keeps zero-norm rows from dividing by zero.
    sim = inner / (scale + 1e-6)
    # Guard against -inf entries (kept from the original formulation).
    sim[np.isneginf(sim)] = 0.
    return 0.5 + 0.5 * sim
def cossim_pair_np(v1):
    """All-pairs cosine similarity among rows of v1 (NumPy), mapped to [0, 1]."""
    row_norms = np.linalg.norm(v1, axis=1)
    # outer(row_norms, row_norms) is the denominator grid; epsilon avoids 0-division.
    sim = (v1 @ v1.T) / (np.outer(row_norms, row_norms) + 1e-6)
    # Guard against -inf entries (kept from the original formulation).
    sim[np.isneginf(sim)] = 0.
    return 0.5 + 0.5 * sim
def cossim(v1, v2):
    """Pairwise cosine similarity between rows of v1 and v2 (torch), in [0, 1]."""
    inner = torch.matmul(v1, v2.T)
    scale = torch.outer(torch.norm(v1, dim=1), torch.norm(v2, dim=1))
    # Epsilon protects zero-norm rows from dividing by zero.
    sim = inner / (scale + 1e-6)
    # Guard against -inf entries (kept from the original formulation).
    sim[torch.isneginf(sim)] = 0.
    return 0.5 + 0.5 * sim
def cossim_pair(v1):
    """All-pairs cosine similarity among rows of v1 (torch), mapped to [0, 1]."""
    row_norms = torch.norm(v1, dim=1)
    # Denominator grid of norm products; epsilon avoids division by zero.
    sim = torch.matmul(v1, v1.T) / (torch.outer(row_norms, row_norms) + 1e-6)
    # Guard against -inf entries (kept from the original formulation).
    sim[torch.isneginf(sim)] = 0.
    return 0.5 + 0.5 * sim

View File

@@ -0,0 +1,36 @@
import torch
import numpy as np
def euclidean_dist(x, y):
    """Pairwise Euclidean distances between rows of x and rows of y (torch).

    Args:
        x: float tensor of shape (m, d).
        y: float tensor of shape (n, d).
    Returns:
        (m, n) tensor where entry (i, j) is ||x_i - y_j||_2.
    """
    m, n = x.size(0), y.size(0)
    xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, n)
    yy = torch.pow(y, 2).sum(1, keepdim=True).expand(n, m).t()
    dist = xx + yy
    # ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a.b.  The positional (beta, alpha)
    # addmm_ overload is removed in current PyTorch, so pass them by keyword.
    dist.addmm_(x, y.t(), beta=1, alpha=-2)
    # Clamp avoids sqrt of tiny negatives caused by floating-point cancellation.
    dist = dist.clamp(min=1e-12).sqrt()
    return dist
def euclidean_dist_pair(x):
    """All-pairs Euclidean distance matrix for rows of x (torch).

    Args:
        x: float tensor of shape (m, d).
    Returns:
        (m, m) symmetric tensor of distances; diagonal is ~sqrt(1e-12) = 1e-6
        (exact zeros are clamped before the sqrt).
    """
    m = x.size(0)
    xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, m)
    dist = xx + xx.t()
    # ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a.b.  The positional (beta, alpha)
    # addmm_ overload is removed in current PyTorch, so pass them by keyword.
    dist.addmm_(x, x.t(), beta=1, alpha=-2)
    # Clamp avoids sqrt of tiny negatives caused by floating-point cancellation.
    dist = dist.clamp(min=1e-12).sqrt()
    return dist
def euclidean_dist_np(x, y):
    """Pairwise Euclidean distances between rows of x (m, d) and y (n, d), NumPy.

    Returns an (m, n) matrix; squared distances are clipped at 1e-12 before the
    square root to avoid NaNs from floating-point cancellation.
    """
    cross = np.dot(x, y.T)
    # Broadcasting the squared row norms replaces explicit np.repeat tiling.
    sq_x = np.sum(x * x, axis=1)[:, np.newaxis]
    sq_y = np.sum(y * y, axis=1)[np.newaxis, :]
    return np.sqrt(np.clip(sq_x + sq_y - 2. * cross, 1e-12, None))
# Calculate the Euclidean distance of each sample in x; returns an N*N matrix
# whose diagonal is zero (up to the sqrt(1e-12) clipping floor).
def euclidean_dist_pair_np(x):
    """All-pairs Euclidean distance matrix for rows of x (NumPy)."""
    cross = np.dot(x, x.T)
    # Broadcasting the squared row norms replaces explicit np.repeat tiling.
    sq = np.sum(x * x, axis=1)[:, np.newaxis]
    return np.sqrt(np.clip(sq + sq.T - 2. * cross, 1e-12, None))

View File

@@ -0,0 +1,144 @@
import numpy as np
class SubmodularFunction(object):
    """Base class for submodular set functions over a sample index set.

    The sample similarity can be supplied in one of two ways:
      * ``similarity_matrix`` -- a precomputed (n, n) pairwise similarity
        matrix for the whole sample set; or
      * ``similarity_kernel`` -- a callable ``kernel(rows, cols)`` used to
        calculate similarities incrementally at a later time if required.

    Args:
        index: array-like of sample indices the function is defined over.
        similarity_kernel: optional callable computing similarities on demand.
        similarity_matrix: optional precomputed (n, n) similarity matrix.
        already_selected: indices selected before optimization starts
            (defaults to an empty list).
    """
    def __init__(self, index, similarity_kernel=None, similarity_matrix=None, already_selected=None):
        self.index = index
        self.n = len(index)
        # A fresh list per instance: a mutable default argument ([]) would be
        # shared by every instance of this class and all its subclasses.
        self.already_selected = [] if already_selected is None else already_selected
        assert similarity_kernel is not None or similarity_matrix is not None
        if similarity_kernel is not None:
            assert callable(similarity_kernel)
            # Let subclasses wrap the kernel (e.g. to add column caching).
            self.similarity_kernel = self._similarity_kernel(similarity_kernel)
        else:
            assert similarity_matrix.shape[0] == self.n and similarity_matrix.shape[1] == self.n
            self.similarity_matrix = similarity_matrix
            # Matrix mode: the kernel is a direct (rows, cols) lookup.
            self.similarity_kernel = lambda a, b: self.similarity_matrix[np.ix_(a, b)]

    def _similarity_kernel(self, similarity_kernel):
        # Default behavior: use the user kernel as-is; subclasses override.
        return similarity_kernel
class FacilityLocation(SubmodularFunction):
    """Facility-location function: f(S) = sum_i max_{j in S} sim(i, j).

    ``cur_max`` tracks, for every sample, the best similarity to anything
    selected so far, so marginal gains are cheap to evaluate.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        if len(self.already_selected) == 0:
            # Nothing selected yet: best similarity so far is zero everywhere.
            self.cur_max = np.zeros(self.n, dtype=np.float32)
        else:
            self.cur_max = np.max(self.similarity_kernel(np.arange(self.n), self.already_selected), axis=1)
        # Boolean mask selecting every sample; reused for full-row queries.
        self.all_idx = np.ones(self.n, dtype=bool)

    def _similarity_kernel(self, similarity_kernel):
        # Cache computed similarity columns so each is evaluated at most once.
        self.sim_matrix = np.zeros([self.n, self.n], dtype=np.float32)
        self.if_columns_calculated = np.zeros(self.n, dtype=bool)

        def _func(a, b):
            if not np.all(self.if_columns_calculated[b]):
                if b.dtype != bool:
                    # Convert integer indices to a boolean column mask.
                    temp = ~self.all_idx
                    temp[b] = True
                    b = temp
                not_calculated = b & ~self.if_columns_calculated
                self.sim_matrix[:, not_calculated] = similarity_kernel(self.all_idx, not_calculated)
                self.if_columns_calculated[not_calculated] = True
            return self.sim_matrix[np.ix_(a, b)]
        return _func

    def calc_gain(self, idx_gain, selected, **kwargs):
        """Marginal gain of adding each candidate in ``idx_gain``."""
        gains = np.maximum(0., self.similarity_kernel(self.all_idx, idx_gain) - self.cur_max.reshape(-1, 1)).sum(axis=0)
        return gains

    def calc_gain_batch(self, idx_gain, selected, **kwargs):
        """Same as calc_gain, accumulating over row batches of size kwargs["batch"]."""
        batch_idx = ~self.all_idx
        batch_idx[0:kwargs["batch"]] = True
        gains = np.maximum(0., self.similarity_kernel(batch_idx, idx_gain) - self.cur_max[batch_idx].reshape(-1, 1)).sum(axis=0)
        for i in range(kwargs["batch"], self.n, kwargs["batch"]):
            batch_idx = ~self.all_idx
            # BUG FIX: `i` already advances in steps of `batch`, so the slice is
            # [i, i + batch); the original multiplied by `batch` again and
            # skipped every row batch after the first.
            batch_idx[i:i + kwargs["batch"]] = True
            gains += np.maximum(0., self.similarity_kernel(batch_idx, idx_gain) - self.cur_max[batch_idx].reshape(-1, 1)).sum(axis=0)
        return gains

    def update_state(self, new_selection, total_selected, **kwargs):
        """Fold newly selected samples into the per-sample running maximum."""
        self.cur_max = np.maximum(self.cur_max, np.max(self.similarity_kernel(self.all_idx, new_selection), axis=1))
class GraphCut(SubmodularFunction):
    """Graph-cut submodular function.

    ``lam`` trades off total similarity mass of a candidate (its column sum)
    against redundancy with the already selected set (see calc_gain).
    """
    def __init__(self, lam: float = 1., **kwargs):
        super().__init__(**kwargs)
        self.lam = lam
        # In matrix mode the column sums can be precomputed once; in kernel
        # mode they are filled incrementally by _similarity_kernel below.
        if 'similarity_matrix' in kwargs:
            self.sim_matrix_cols_sum = np.sum(self.similarity_matrix, axis=0)
        # Boolean mask selecting every sample; reused for full-row queries.
        self.all_idx = np.ones(self.n, dtype=bool)
    def _similarity_kernel(self, similarity_kernel):
        # Initialize a matrix to store similarity values of sample points.
        self.sim_matrix = np.zeros([self.n, self.n], dtype=np.float32)
        self.sim_matrix_cols_sum = np.zeros(self.n, dtype=np.float32)
        # Tracks which columns of sim_matrix have been filled in.
        self.if_columns_calculated = np.zeros(self.n, dtype=bool)
        def _func(a, b):
            # Lazily compute any requested columns that are still missing.
            if not np.all(self.if_columns_calculated[b]):
                if b.dtype != bool:
                    # Convert integer indices to a boolean column mask.
                    temp = ~self.all_idx
                    temp[b] = True
                    b = temp
                not_calculated = b & ~self.if_columns_calculated
                self.sim_matrix[:, not_calculated] = similarity_kernel(self.all_idx, not_calculated)
                # Column sums must be updated right after the columns are filled.
                self.sim_matrix_cols_sum[not_calculated] = np.sum(self.sim_matrix[:, not_calculated], axis=0)
                self.if_columns_calculated[not_calculated] = True
            return self.sim_matrix[np.ix_(a, b)]
        return _func
    def calc_gain(self, idx_gain, selected, **kwargs):
        # Conditional gain: lam * (total similarity mass of the candidate)
        # minus twice its similarity to the already selected set.
        # (selected, idx_gain) is passed so the kernel closure _func() can
        # lazily materialize the needed columns.
        gain = -2. * np.sum(self.similarity_kernel(selected, idx_gain), axis=0) + self.lam * self.sim_matrix_cols_sum[idx_gain]
        return gain
    def update_state(self, new_selection, total_selected, **kwargs):
        # Graph-cut gains need no incremental state beyond the cached columns.
        pass
class LogDeterminant(SubmodularFunction):
    """Log-determinant submodular function over the sample similarity matrix."""
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Boolean mask selecting every sample; reused for full-row queries.
        self.all_idx = np.ones(self.n, dtype=bool)
    def _similarity_kernel(self, similarity_kernel):
        # Initialize a matrix to store similarity values of sample points.
        self.sim_matrix = np.zeros([self.n, self.n], dtype=np.float32)
        # Tracks which columns of sim_matrix have been filled in.
        self.if_columns_calculated = np.zeros(self.n, dtype=bool)
        def _func(a, b):
            # Lazily compute any requested columns that are still missing.
            if not np.all(self.if_columns_calculated[b]):
                if b.dtype != bool:
                    # Convert integer indices to a boolean column mask.
                    temp = ~self.all_idx
                    temp[b] = True
                    b = temp
                not_calculated = b & ~self.if_columns_calculated
                self.sim_matrix[:, not_calculated] = similarity_kernel(self.all_idx, not_calculated)
                self.if_columns_calculated[not_calculated] = True
            return self.sim_matrix[np.ix_(a, b)]
        return _func
    def calc_gain(self, idx_gain, selected, **kwargs):
        # Gain for LogDeterminant can be written as $f(x | A ) = \log\det(S_{a} - S_{a,A}S_{A}^{-1}S_{x,A}^T)$.
        # NOTE(review): the expression below returns only the quadratic form
        # S_{x,A} S_A^{-1} S_{x,A}^T -- no log and no diagonal S_x term from
        # the formula above. Confirm this is the intended ranking surrogate.
        sim_idx_gain = self.similarity_kernel(selected, idx_gain).T
        sim_selected = self.similarity_kernel(selected, selected)
        # pinv (not inv) tolerates a singular selected-set similarity matrix.
        return (np.dot(sim_idx_gain, np.linalg.pinv(sim_selected)) * sim_idx_gain).sum(-1)
    def update_state(self, new_selection, total_selected, **kwargs):
        # No incremental state is needed beyond the cached similarity columns.
        pass

View File

@@ -0,0 +1,155 @@
import numpy as np
from tqdm import tqdm
# Names of the greedy optimizer classes defined in this module.
optimizer_choices = ["NaiveGreedy", "LazyGreedy", "StochasticGreedy", "ApproximateLazyGreedy"]
class optimizer(object):
    """Base class for greedy submodular-maximization optimizers.

    Args:
        args: configuration object; subclasses read args.TRAIN.PRINT_FREQ.
        index: array-like of sample indices selection is performed over.
        budget: number of samples to select; must satisfy 0 < budget <= len(index).
        already_selected: positions (into ``index``) selected before
            optimization starts (defaults to an empty list).

    Raises:
        ValueError: if ``budget`` is not a legal selection size.
    """
    def __init__(self, args, index, budget: int, already_selected=None):
        self.args = args
        self.index = index
        if budget <= 0 or budget > len(index):
            raise ValueError("Illegal budget for optimizer.")
        self.n = len(index)
        self.budget = budget
        # Fresh list per instance: a mutable default argument ([]) would be
        # shared by every optimizer instance.
        self.already_selected = [] if already_selected is None else already_selected
class NaiveGreedy(optimizer):
    """Plain greedy selection: every round recomputes the marginal gain of all
    unselected candidates and picks the argmax."""
    def __init__(self, args, index, budget: int, already_selected=None):
        # None (not []) as default avoids a shared mutable default argument;
        # normalize before handing off to the base class.
        super(NaiveGreedy, self).__init__(args, index, budget,
                                          [] if already_selected is None else already_selected)

    def select(self, gain_function, update_state=None, **kwargs):
        """Greedily pick ``budget`` elements; returns the selected entries of ``self.index``.

        Args:
            gain_function: callable(candidates, selected_mask, **kwargs) -> gains
                for each candidate (candidates given as a boolean mask here).
            update_state: optional callable(new_selection, selected_mask, **kwargs)
                notified after each pick.
        """
        assert callable(gain_function)
        if update_state is not None:
            assert callable(update_state)
        selected = np.zeros(self.n, dtype=bool)
        selected[self.already_selected] = True
        greedy_gain = np.zeros(len(self.index))
        for i in range(sum(selected), self.budget):
            if i % self.args.TRAIN.PRINT_FREQ == 0:
                print("| Selecting [%3d/%3d]" % (i + 1, self.budget))
            # Recompute exact gains for everything not yet selected.
            greedy_gain[~selected] = gain_function(~selected, selected, **kwargs)
            current_selection = greedy_gain.argmax()
            selected[current_selection] = True
            # -inf keeps an already-picked element from winning argmax again.
            greedy_gain[current_selection] = -np.inf
            if update_state is not None:
                update_state(np.array([current_selection]), selected, **kwargs)
        return self.index[selected]
class LazyGreedy(optimizer):
    """Lazy greedy (Minoux's accelerated greedy).

    Exploits submodularity: marginal gains only shrink as the selected set
    grows, so cached gains are valid upper bounds. Each round re-evaluates
    only the current top candidate; when it stays on top after re-evaluation
    it is selected without touching the rest.
    """
    def __init__(self, args, index, budget: int, already_selected=None):
        # None (not []) as default avoids a shared mutable default argument.
        super(LazyGreedy, self).__init__(args, index, budget,
                                         [] if already_selected is None else already_selected)

    def select(self, gain_function, update_state=None, **kwargs):
        """Greedily pick ``budget`` elements; returns the selected entries of ``self.index``."""
        assert callable(gain_function)
        if update_state is not None:
            assert callable(update_state)
        selected = np.zeros(self.n, dtype=bool)
        selected[self.already_selected] = True
        greedy_gain = np.zeros(len(self.index))
        # One exact pass to initialize the cached gains of all candidates.
        greedy_gain[~selected] = gain_function(~selected, selected, **kwargs)
        greedy_gain[selected] = -np.inf
        for i in tqdm(range(sum(selected), self.budget)):
            if i % self.args.TRAIN.PRINT_FREQ == 0:
                print("| Selecting [%3d/%3d]" % (i + 1, self.budget))
            best_gain = -np.inf
            last_max_element = -1
            while True:
                cur_max_element = greedy_gain.argmax()
                if last_max_element == cur_max_element:
                    # The freshly re-evaluated gain is still the maximum:
                    # select cur_max_element into the current subset.
                    selected[cur_max_element] = True
                    greedy_gain[cur_max_element] = -np.inf
                    if update_state is not None:
                        update_state(np.array([cur_max_element]), selected, **kwargs)
                    break
                # Refresh the stale cached gain of the current top candidate.
                new_gain = gain_function(np.array([cur_max_element]), selected, **kwargs)[0]
                greedy_gain[cur_max_element] = new_gain
                if new_gain >= best_gain:
                    best_gain = new_gain
                    last_max_element = cur_max_element
        return self.index[selected]
class StochasticGreedy(optimizer):
    """Stochastic greedy: each round evaluates gains only on a uniformly
    random subset of the unselected samples instead of all of them.

    Args:
        epsilon: approximation parameter; the per-round sample size is
            ``max(round(-log(epsilon) * n / budget), 1)``.
    """
    def __init__(self, args, index, budget: int, already_selected=None, epsilon: float = 0.9):
        # None (not []) as default avoids a shared mutable default argument.
        super(StochasticGreedy, self).__init__(args, index, budget,
                                               [] if already_selected is None else already_selected)
        self.epsilon = epsilon

    def select(self, gain_function, update_state=None, **kwargs):
        """Greedily pick ``budget`` elements; returns the selected entries of ``self.index``.

        NOTE: uses np.random without seeding, so results vary run to run
        unless the caller seeds the global RNG.
        """
        assert callable(gain_function)
        if update_state is not None:
            assert callable(update_state)
        selected = np.zeros(self.n, dtype=bool)
        selected[self.already_selected] = True
        sample_size = max(round(-np.log(self.epsilon) * self.n / self.budget), 1)
        greedy_gain = np.zeros(len(self.index))
        all_idx = np.arange(self.n)
        for i in range(sum(selected), self.budget):
            if i % self.args.TRAIN.PRINT_FREQ == 0:
                print("| Selecting [%3d/%3d]" % (i + 1, self.budget))
            # Uniformly select a subset from unselected samples with size
            # sample_size, capped at the number of remaining candidates.
            subset = np.random.choice(all_idx[~selected], replace=False, size=min(sample_size, self.n - i))
            if len(subset) == 0:
                break
            greedy_gain[subset] = gain_function(subset, selected, **kwargs)
            current_selection = greedy_gain[subset].argmax()
            selected[subset[current_selection]] = True
            greedy_gain[subset[current_selection]] = -np.inf
            if update_state is not None:
                update_state(np.array([subset[current_selection]]), selected, **kwargs)
        return self.index[selected]
class ApproximateLazyGreedy(optimizer):
    """Approximate lazy greedy: accept the current top cached candidate as soon
    as its re-evaluated gain is within a factor ``beta`` of its cached value,
    instead of requiring it to remain the exact maximum.

    Args:
        beta: acceptance factor in (0, 1]; beta=1 approaches exact lazy greedy.
    """
    def __init__(self, args, index, budget: int, already_selected=None, beta: float = 0.9):
        # None (not []) as default avoids a shared mutable default argument.
        super(ApproximateLazyGreedy, self).__init__(args, index, budget,
                                                    [] if already_selected is None else already_selected)
        self.beta = beta

    def select(self, gain_function, update_state=None, **kwargs):
        """Greedily pick ``budget`` elements; returns the selected entries of ``self.index``."""
        assert callable(gain_function)
        if update_state is not None:
            assert callable(update_state)
        selected = np.zeros(self.n, dtype=bool)
        selected[self.already_selected] = True
        greedy_gain = np.zeros(len(self.index))
        # One exact pass to initialize the cached gains of all candidates.
        greedy_gain[~selected] = gain_function(~selected, selected, **kwargs)
        greedy_gain[selected] = -np.inf
        for i in range(sum(selected), self.budget):
            if i % self.args.TRAIN.PRINT_FREQ == 0:
                print("| Selecting [%3d/%3d]" % (i + 1, self.budget))
            while True:
                cur_max_element = greedy_gain.argmax()
                max_gain = greedy_gain[cur_max_element]
                # Re-evaluate the cached top candidate against the current set.
                new_gain = gain_function(np.array([cur_max_element]), selected, **kwargs)[0]
                if new_gain >= self.beta * max_gain:
                    # Close enough to its cached bound: select cur_max_element
                    # into the current subset.
                    selected[cur_max_element] = True
                    greedy_gain[cur_max_element] = -np.inf
                    if update_state is not None:
                        update_state(np.array([cur_max_element]), selected, **kwargs)
                    break
                else:
                    # Too stale: store the refreshed gain and try again.
                    greedy_gain[cur_max_element] = new_gain
        return self.index[selected]