from numpy.random.mtrand import normal
import torch
import numpy as np
from torch.utils.data import Dataset
from collections import defaultdict as ddict
import random
from .DataPreprocess import *
from IPython import embed
import dgl
import torch.nn.functional as F
import time
import queue
class UniSampler(BaseSampler):
    """Random negative sampling.

    Negative entities are drawn through the inherited ``head_batch`` /
    ``tail_batch`` helpers (defined outside this block; per the original
    documentation they filter out known positive samples).

    Attributes:
        cross_sampling_flag: Toggled between 0 and 1 on every call to
            ``sampling``; 0 selects head corruption, 1 tail corruption.
    """

    def __init__(self, args):
        super().__init__(args)
        self.cross_sampling_flag = 0

    def sampling(self, data):
        """Build one training batch, alternating head/tail corruption.

        Args:
            data: Iterable of ``(h, r, t)`` triples.

        Returns:
            batch_data: Dict with ``mode`` ("head-batch" or "tail-batch"),
            ``positive_sample``, ``negative_sample`` and, when
            ``args.use_weight`` is truthy, ``subsampling_weight``.
        """
        batch_data = {}
        neg_ent_sample = []
        subsampling_weight = []

        # The flag is flipped before it is read, so the very first call
        # produces a "tail-batch".
        self.cross_sampling_flag = 1 - self.cross_sampling_flag
        if self.cross_sampling_flag == 0:
            batch_data['mode'] = "head-batch"
            corrupt = self.head_batch
        else:
            batch_data['mode'] = "tail-batch"
            corrupt = self.tail_batch

        for h, r, t in data:
            neg_ent_sample.append(corrupt(h, r, t, self.args.num_neg))
            if self.args.use_weight:
                # self.count is provided by the base class; the (t, -r-1)
                # key presumably addresses the inverse relation -- TODO confirm.
                weight = self.count[(h, r)] + self.count[(t, -r-1)]
                subsampling_weight.append(weight)

        batch_data["positive_sample"] = torch.LongTensor(np.array(data))
        batch_data['negative_sample'] = torch.LongTensor(np.array(neg_ent_sample))
        if self.args.use_weight:
            batch_data["subsampling_weight"] = torch.sqrt(1/torch.tensor(subsampling_weight))
        return batch_data

    def uni_sampling(self, data):
        """Build a batch holding both head and tail negatives for every triple.

        Args:
            data: Iterable of ``(h, r, t)`` triples.

        Returns:
            batch_data: Dict with ``positive_sample``, ``negative_head`` and
            ``negative_tail``.
        """
        batch_data = {}
        neg_head_list = []
        neg_tail_list = []
        for h, r, t in data:
            # Head negatives are drawn before tail negatives for each triple.
            neg_head_list.append(self.head_batch(h, r, t, self.args.num_neg))
            neg_tail_list.append(self.tail_batch(h, r, t, self.args.num_neg))

        batch_data["positive_sample"] = torch.LongTensor(np.array(data))
        # Fixed: the original called the non-existent ``np.arrary``.
        batch_data['negative_head'] = torch.LongTensor(np.array(neg_head_list))
        batch_data['negative_tail'] = torch.LongTensor(np.array(neg_tail_list))
        return batch_data

    def get_sampling_keys(self):
        """Return the batch keys produced by ``sampling`` (weights excluded)."""
        return ['positive_sample', 'negative_sample', 'mode']
class BernSampler(BaseSampler):
    """Choose head or tail corruption per negative with a Bernoulli draw.

    Attributes:
        lef_mean: Per relation, number of training triples divided by the
            number of distinct heads.
        rig_mean: Per relation, number of training triples divided by the
            number of distinct tails.
    """

    def __init__(self, args):
        super().__init__(args)
        self.lef_mean, self.rig_mean = self.calc_bern()

    def __normal_batch(self, h, r, t, neg_size):
        """Generate ``neg_size`` corrupted entities for one triple.

        Args:
            h: The head of the triple.
            r: The relation of the triple.
            t: The tail of the triple.
            neg_size: The number of negative samples for the triple.

        Returns:
            numpy.array: Corrupted heads followed by corrupted tails,
            ``neg_size`` entries in total.
        """
        if self.args.bern_flag:
            prob = self.rig_mean[r] / (self.rig_mean[r] + self.lef_mean[r])
        else:
            prob = 0.5

        # One uniform draw per negative decides head vs. tail corruption.
        neg_size_h = sum(random.random() < prob for _ in range(neg_size))
        neg_size_t = neg_size - neg_size_h

        neg_list_h = []
        neg_cur_size = 0
        while neg_cur_size < neg_size_h:
            # Request twice what is still missing; corrupt_head (defined
            # outside this block) may return fewer candidates than asked.
            neg_tmp_h = self.corrupt_head(t, r, num_max=(neg_size_h - neg_cur_size) * 2)
            neg_list_h.append(neg_tmp_h)
            neg_cur_size += len(neg_tmp_h)
        if neg_list_h:
            neg_list_h = np.concatenate(neg_list_h)

        neg_list_t = []
        neg_cur_size = 0
        while neg_cur_size < neg_size_t:
            neg_tmp_t = self.corrupt_tail(h, r, num_max=(neg_size_t - neg_cur_size) * 2)
            neg_list_t.append(neg_tmp_t)
            neg_cur_size += len(neg_tmp_t)
        if neg_list_t:
            neg_list_t = np.concatenate(neg_list_t)

        return np.hstack((neg_list_h[:neg_size_h], neg_list_t[:neg_size_t]))

    def sampling(self, data):
        """Build one training batch with Bernoulli head/tail corruption.

        Args:
            data: Iterable of ``(h, r, t)`` triples.

        Returns:
            batch_data: Dict with ``mode`` ('bern'), ``positive_sample`` and
            ``negative_sample`` (one row of negatives per positive triple).
        """
        batch_data = {}
        batch_data['mode'] = 'bern'
        # Fixed: the original re-assigned the accumulator on every iteration,
        # so only the negatives of the last triple survived.
        neg_ent_sample = [
            self.__normal_batch(h, r, t, self.args.num_neg) for h, r, t in data
        ]
        batch_data["positive_sample"] = torch.LongTensor(np.array(data))
        batch_data['negative_sample'] = torch.LongTensor(np.array(neg_ent_sample))
        return batch_data

    def calc_bern(self):
        """Compute the per-relation statistics used for the Bernoulli draw.

        Returns:
            lef_mean: relation -> triples per distinct head.
            rig_mean: relation -> triples per distinct tail.
        """
        h_of_r = ddict(set)
        t_of_r = ddict(set)
        freqRel = ddict(float)
        lef_mean = ddict(float)
        rig_mean = ddict(float)
        for h, r, t in self.train_triples:
            freqRel[r] += 1.0
            h_of_r[r].add(h)
            t_of_r[r].add(t)
        for r in h_of_r:
            lef_mean[r] = freqRel[r] / len(h_of_r[r])
            rig_mean[r] = freqRel[r] / len(t_of_r[r])
        return lef_mean, rig_mean

    @staticmethod
    def sampling_keys():
        """Return the batch keys produced by ``sampling``."""
        return ['positive_sample', 'negative_sample', 'mode']
class AdvSampler(BaseSampler):
    r"""Self-adversarial negative sampling, in math:

    p\left(h_{j}^{\prime}, r, t_{j}^{\prime} \mid\left\{\left(h_{i}, r_{i}, t_{i}\right)\right\}\right)=\frac{\exp \alpha f_{r}\left(\mathbf{h}_{j}^{\prime}, \mathbf{t}_{j}^{\prime}\right)}{\sum_{i} \exp \alpha f_{r}\left(\mathbf{h}_{i}^{\prime}, \mathbf{t}_{i}^{\prime}\right)}

    The docstring is a raw string so the LaTeX backslashes are kept verbatim
    instead of being parsed as (partly invalid) escape sequences.

    Attributes:
        freq_hr: The count of (h, r) pairs.
        freq_tr: The count of (t, r) pairs.
    """

    def __init__(self, args):
        super().__init__(args)
        self.freq_hr, self.freq_tr = self.calc_freq()

    def sampling(self, pos_sample):
        """Compute a frequency-based weight for every positive triple.

        Args:
            pos_sample: Tensor of ``(h, r, t)`` triples; it is converted with
                ``.numpy()``, so it must live on the CPU.

        Returns:
            Tensor on the CUDA device holding
            ``sqrt(1 / (freq_hr[(h, r)] + freq_tr[(t, r)]))`` per triple.
        """
        weights = [
            self.freq_hr[(h, r)] + self.freq_tr[(t, r)]
            for h, r, t in pos_sample.numpy().tolist()
        ]
        # NOTE: the result is moved to CUDA unconditionally, as in the original.
        adv_sampling = torch.tensor(weights, dtype=torch.float32).cuda()
        return torch.sqrt(1 / adv_sampling)

    def calc_freq(self):
        """Count (h, r) and (t, r) occurrences over the training triples.

        The first occurrence of a pair is stored as ``args.freq_init``; each
        further occurrence adds 1.

        Returns:
            freq_hr: The count of (h, r) pairs.
            freq_tr: The count of (t, r) pairs.
        """
        freq_hr, freq_tr = {}, {}
        for h, r, t in self.train_triples:
            if (h, r) not in freq_hr:
                freq_hr[(h, r)] = self.args.freq_init
            else:
                freq_hr[(h, r)] += 1
            if (t, r) not in freq_tr:
                freq_tr[(t, r)] = self.args.freq_init
            else:
                freq_tr[(t, r)] += 1
        return freq_hr, freq_tr
class AllSampler(RevSampler):
    """1-to-all sampler: every entity is scored as a candidate tail.

    For each triple the label row marks the tails stored in
    ``hr2t_train[(h, r)]`` with 1; all other entities stay 0 and therefore
    act as negatives.
    """

    def __init__(self, args):
        super().__init__(args)

    def sampling(self, data):
        """Build the sample tensor and the multi-hot tail labels.

        Args:
            data: Sequence of ``(h, r, t)`` triples.

        Returns:
            batch_data: Dict with ``sample`` (LongTensor of the triples) and
            ``label`` (float tensor of shape ``len(data) x args.num_ent``).
        """
        label_table = torch.zeros(len(data), self.args.num_ent)
        for row, (h, r, _) in enumerate(data):
            # hr2t_train is expected to be supplied by the parent class.
            known_tails = self.hr2t_train[(h, r)]
            label_table[row][known_tails] = 1
        return {
            "sample": torch.LongTensor(np.array(data)),
            "label": label_table.float(),
        }

    def sampling_keys(self):
        """Return the batch keys produced by ``sampling``."""
        return ["sample", "label"]
class CrossESampler(BaseSampler):
    """Sampler producing head- and tail-prediction labels for each triple.

    TODO: the class name is still up for discussion.
    """

    def __init__(self, args):
        super().__init__(args)
        self.neg_weight = float(self.args.neg_weight / self.args.num_ent)

    def sampling(self, data):
        """Label every sample for head and tail prediction at the same time.

        Args:
            data: Sequence of ``(h, r, t)`` triples.

        Returns:
            batch_data: Dict with ``sample``, ``hr_label`` and ``tr_label``.
        """
        batch_size = len(data)
        # Two independent random initialisations; hr first, then tr.
        hr_label = self.init_label(batch_size)
        tr_label = self.init_label(batch_size)
        for row, (h, r, t) in enumerate(data):
            known_tails = self.hr2t_train[(h, r)]
            hr_label[row][known_tails] = 1.0
            known_heads = self.rt2h_train[(r, t)]
            tr_label[row][known_heads] = 1.0
        return {
            "sample": torch.LongTensor(data),
            "hr_label": hr_label.float(),
            "tr_label": tr_label.float(),
        }

    def init_label(self, row):
        """Create a ``row x args.num_ent`` label matrix of 0 / -1 entries.

        An entry is 0 where a uniform draw exceeds ``neg_weight`` and -1
        otherwise.
        """
        draws = torch.rand(row, self.args.num_ent)
        return (draws > self.neg_weight).float() - 1.0

    def sampling_keys(self):
        """Return the advertised batch keys.

        NOTE(review): ``sampling`` emits 'hr_label' / 'tr_label', not
        'label' -- confirm which one callers rely on.
        """
        return ["sample", "label"]
class ConvSampler(RevSampler):
    """1-to-all sampler over merged (head, relation) pairs.

    Triples sharing head and relation are treated as one sample; all
    entities that are not listed as tails of the pair stay 0 in the label
    row and act as negatives.

    Attributes:
        label: Multi-hot tail labels of the most recent batch.
        triples: The (head, relation) pairs of the most recent batch.
    """

    def __init__(self, args):
        self.label = None
        self.triples = None
        super().__init__(args)
        # Parent helper (spelled this way upstream); presumably prepares the
        # merged (h, r) -> tails training data -- TODO confirm.
        super().get_hr_trian()

    def sampling(self, pos_hr_t):
        """Build the batch for a list of ``((h, r), tails)`` items.

        Args:
            pos_hr_t: Sequence of ``(hr, tails)`` pairs.

        Returns:
            batch_data: Dict with ``sample`` (LongTensor of the hr pairs) and
            ``label`` (``args.train_bs x args.num_ent`` tensor).
        """
        self.label = torch.zeros(self.args.train_bs, self.args.num_ent)
        hr_pairs = []
        for row, (hr, tails) in enumerate(pos_hr_t):
            hr_pairs.append(hr)
            self.label[row][tails] = 1
        self.triples = torch.LongTensor(hr_pairs)
        return {"sample": self.triples, "label": self.label}

    def sampling_keys(self):
        """Return the batch keys produced by ``sampling``."""
        return ["sample", "label"]
class XTransESampler(RevSampler):
    """Random negative sampling that also records the head's neighbours.

    Attributes:
        triples: The triples of the most recent batch.
        neg_sample: The negative samples of the most recent batch.
        h_neighbor: Per sample, up to ``max_neighbor`` entries taken from
            ``h2rt_train[h]`` (zero padded).
        h_mask: Per sample, 1 where ``h_neighbor`` holds a real entry.
        max_neighbor: Maximum number of neighbours kept per head.
    """

    def __init__(self, args):
        super().__init__(args)
        super().get_h2rt_t2hr_from_train()
        self.triples = None
        self.neg_sample = None
        self.h_neighbor = None
        self.h_mask = None
        self.max_neighbor = 200

    def sampling(self, data):
        """Build one tail-corruption batch with neighbour information.

        Args:
            data: Sequence of ``(h, r, t)`` triples.

        Returns:
            batch_data: Dict with ``positive_sample``, ``negative_sample``,
            ``neighbor``, ``mask`` and ``mode`` ("tail-batch").
        """
        batch_data = {}
        neg_ent_sample = []
        width = self.max_neighbor
        # Allocate only the columns that are returned. The original filled a
        # train_bs x 20000 x 2 buffer and then sliced it to max_neighbor,
        # wasting memory and failing for heads with more than 20000 entries.
        mask = np.zeros([self.args.train_bs, width], dtype=float)
        h_neighbor = np.zeros([self.args.train_bs, width, 2])
        for row, (h, r, t) in enumerate(data):
            neighbours = np.array(self.h2rt_train[h])[:width]
            kept = len(neighbours)
            if kept:  # an empty array cannot be broadcast into (0, 2)
                h_neighbor[row, :kept] = neighbours
                mask[row, :kept] = 1
            neg_ent_sample.append(self.tail_batch(h, r, t, self.args.num_neg))

        self.triples = data
        self.neg_sample = neg_ent_sample
        self.h_neighbor = h_neighbor
        self.h_mask = mask

        batch_data["positive_sample"] = torch.LongTensor(self.triples)
        batch_data['negative_sample'] = torch.LongTensor(self.neg_sample)
        batch_data['neighbor'] = torch.LongTensor(self.h_neighbor)
        batch_data['mask'] = torch.LongTensor(self.h_mask)
        batch_data['mode'] = "tail-batch"
        return batch_data

    def get_sampling_keys(self):
        """Return the batch keys produced by ``sampling``."""
        return ['positive_sample', 'negative_sample', 'neighbor', 'mask', 'mode']
class GraphSampler(RevSampler):
    """Graph based sampling in neural network.

    Attributes:
        entity: The entities of sampled triples.
        relation: The relation of sampled triples.
        triples: The sampled triples.
        graph: The graph built from sampled triples with dgl.graph.
        norm: The edge norm in graph.
        label: 1 for positive rows of ``triples``, 0 for negative rows.
    """

    def __init__(self, args):
        super().__init__(args)
        self.entity = None
        self.relation = None
        self.triples = None
        self.graph = None
        self.norm = None
        self.label = None

    def sampling(self, pos_triples):
        """Sample positives/negatives and build the message-passing graph.

        Args:
            pos_triples: The triples used to be sampled.

        Returns:
            batch_data: Dict with ``entity``, ``triples``, ``label``,
            ``graph``, ``relation`` and ``norm``.
        """
        batch_data = {}
        pos_triples = np.array(pos_triples)
        # Positives are re-drawn and their entities re-indexed to 0..k-1.
        pos_triples, self.entity = self.sampling_positive(pos_triples)
        head_triples = self.sampling_negative('head', pos_triples, self.args.num_neg)
        tail_triples = self.sampling_negative('tail', pos_triples, self.args.num_neg)
        self.triples = np.concatenate((pos_triples, head_triples, tail_triples))
        batch_data['entity'] = self.entity
        batch_data['triples'] = self.triples

        # The first train_bs rows are the positives.
        self.label = torch.zeros((len(self.triples), 1))
        self.label[0 : self.args.train_bs] = 1
        batch_data['label'] = self.label

        # Only a random half of the positive edges goes into the graph.
        split_size = int(self.args.train_bs * 0.5)
        graph_split_ids = np.random.choice(
            self.args.train_bs,
            size=split_size,
            replace=False
        )
        head, rela, tail = pos_triples.transpose()
        head = torch.tensor(head[graph_split_ids], dtype=torch.long).contiguous()
        rela = torch.tensor(rela[graph_split_ids], dtype=torch.long).contiguous()
        tail = torch.tensor(tail[graph_split_ids], dtype=torch.long).contiguous()
        self.graph, self.relation, self.norm = self.build_graph(
            len(self.entity), (head, rela, tail), -1)
        batch_data['graph'] = self.graph
        batch_data['relation'] = self.relation
        batch_data['norm'] = self.norm
        return batch_data

    def get_sampling_keys(self):
        """Return the batch keys produced by ``sampling``."""
        return ['graph','triples','label','entity','relation','norm']

    def sampling_negative(self, mode, pos_triples, num_neg):
        """Random negative sampling without filtering.

        Args:
            mode: 'head' or 'tail'; which column is replaced.
            pos_triples: The positive triples.
            num_neg: The number of negative samples per triple.

        Returns:
            neg_samples: ``pos_triples`` tiled ``num_neg`` times with the
            chosen column replaced by random indices below
            ``len(self.entity)``.
        """
        neg_random = np.random.choice(
            len(self.entity),
            size=num_neg * len(pos_triples)
        )
        neg_samples = np.tile(pos_triples, (num_neg, 1))
        if mode == 'head':
            neg_samples[:, 0] = neg_random
        elif mode == 'tail':
            neg_samples[:, 2] = neg_random
        return neg_samples

    def build_graph(self, num_ent, triples, power):
        """Using sampled triples to build a graph by dgl.graph in DGL.

        Args:
            num_ent: The number of entities (graph nodes).
            triples: Indexable of ``(head, rela, tail)`` columns.
            power: The power index for normalization.

        Returns:
            graph: The DGL graph with one edge head -> tail per triple.
            rela: The relations of the edges as a tensor.
            edge_norm: The edge norm in graph.
        """
        head, rela, tail = triples[0], triples[1], triples[2]
        graph = dgl.graph(([], []))
        graph.add_nodes(num_ent)
        graph.add_edges(head, tail)
        node_norm = self.comp_deg_norm(graph, power)
        edge_norm = self.node_norm_to_edge_norm(graph, node_norm)
        # torch.tensor(tensor) emits a copy-construct warning; clone().detach()
        # is the documented equivalent for tensor inputs.
        if torch.is_tensor(rela):
            rela = rela.clone().detach()
        else:
            rela = torch.tensor(rela)
        return graph, rela, edge_norm

    def comp_deg_norm(self, graph, power=-1):
        """Calculating the normalization node weight.

        Args:
            graph: The DGL graph.
            power: The power index for normalization.

        Returns:
            tensor: ``in_degree ** power`` per node, with infinities (from
            zero in-degree and a negative power) replaced by 0.
        """
        graph = graph.local_var()
        in_deg = graph.in_degrees(range(graph.number_of_nodes())).float().numpy()
        # Zero in-degree nodes yield inf here on purpose; silence the warning
        # because the infinities are zeroed on the next line.
        with np.errstate(divide='ignore'):
            norm = in_deg ** power
        norm[np.isinf(norm)] = 0
        return torch.from_numpy(norm)

    def node_norm_to_edge_norm(self, graph, node_norm):
        """Calculating the normalization edge weight.

        Args:
            graph: The DGL graph.
            node_norm: The node weight of normalization.

        Returns:
            tensor: For every edge, the norm of its destination node.
        """
        graph = graph.local_var()
        # convert to edge norm
        graph.ndata['norm'] = node_norm.view(-1, 1)
        graph.apply_edges(lambda edges: {'norm': edges.dst['norm']})
        return graph.edata['norm']

    def sampling_positive(self, positive_triples):
        """Regenerate positive sampling.

        Draws ``args.train_bs`` triples without replacement and re-indexes
        their entities to consecutive local ids.

        Args:
            positive_triples: The positive sampled triples (numpy array).

        Returns:
            The re-indexed triples and a ``(k, 1)`` long tensor holding the
            original id of every local entity index.
        """
        edges = np.random.choice(
            np.arange(len(positive_triples)),
            size=self.args.train_bs,
            replace=False
        )
        edges = positive_triples[edges]
        head, rela, tail = np.array(edges).transpose()
        entity, index = np.unique((head, tail), return_inverse=True)
        head, tail = np.reshape(index, (2, -1))
        return np.stack((head, rela, tail)).transpose(), \
            torch.from_numpy(entity).view(-1, 1).long()
class KBATSampler(BaseSampler):
    """Graph based n_hop neighbours in neural network.

    Attributes:
        n_hop: The n_hop neighbour rows of the most recent batch.
        graph: The adjacency dict ``head -> {tail: relation}``.
        neighbours: Per source entity, the n_hop paths found by BFS.
        adj_matrix: ``(stack(tail, head), rela)`` over all training triples.
        entity: Unique entities of the most recent batch.
        triples_GAT_pos: Positive triples.
        triples_GAT_neg: Negative triples.
        triples_Con: Positive triples followed by negative triples.
        label: 1 for positive rows of ``triples_Con``, -1 for negative rows.
    """

    def __init__(self, args):
        super().__init__(args)
        self.n_hop = None
        self.graph = None
        self.neighbours = None
        self.adj_matrix = None
        self.entity = None
        self.triples_GAT_pos = None
        self.triples_GAT_neg = None
        self.triples_Con = None
        self.label = None
        # Runs one BFS per head entity at construction time.
        self.get_neighbors()

    def sampling(self, pos_triples):
        """Build the combined KBAT / ConvKB training batch.

        Args:
            pos_triples: The triples used to be sampled.

        Returns:
            batch_data: Dict with ``triples_GAT_pos``, ``triples_GAT_neg``,
            ``adj_matrix``, ``n_hop``, ``triples_Con`` and ``label``.
        """
        batch_data = {}
        # -------------------- KBAT sampler --------------------
        self.entity = self.get_unique_entity(pos_triples)
        head_triples = self.sam_negative('head', pos_triples, self.args.num_neg)
        tail_triples = self.sam_negative('tail', pos_triples, self.args.num_neg)
        self.triples_GAT_neg = torch.tensor(np.concatenate((head_triples, tail_triples)))
        batch_data['triples_GAT_pos'] = torch.tensor(pos_triples)
        batch_data['triples_GAT_neg'] = self.triples_GAT_neg

        head, rela, tail = torch.tensor(self.train_triples).t()
        self.adj_matrix = (torch.stack((tail, head)), rela)
        batch_data['adj_matrix'] = self.adj_matrix

        self.n_hop = self.get_batch_nhop_neighbors_all()
        batch_data['n_hop'] = self.n_hop
        # -------------------- ConvKB sampler --------------------
        head_triples = self.sampling_negative('head', pos_triples, self.args.num_neg)
        tail_triples = self.sampling_negative('tail', pos_triples, self.args.num_neg)
        self.triples_Con = np.concatenate((pos_triples, head_triples, tail_triples))
        # Positives occupy the first train_bs rows.
        self.label = -torch.ones((len(self.triples_Con), 1))
        self.label[0 : self.args.train_bs] = 1
        batch_data['triples_Con'] = self.triples_Con
        batch_data['label'] = self.label
        return batch_data

    def get_sampling_keys(self):
        """Return the batch keys produced by ``sampling``."""
        return ['adj_matrix', 'n_hop', 'triples_GAT_pos',
                'triples_GAT_neg', 'triples_Con', 'label']

    def bfs(self, graph, source, nbd_size=2):
        """Use breadth-first search to collect the ``nbd_size``-hop paths.

        Args:
            graph: The adjacency dict ``node -> {neighbour: relation}``.
            source: The start node.
            nbd_size: The number of hops.

        Returns:
            neighbors: ``{nbd_size: [(relations, entities), ...]}`` where, for
            every node at exactly ``nbd_size`` hops, ``relations`` and
            ``entities`` are listed from the target back towards the source
            (the source itself is excluded). Empty when no such node exists.
        """
        distance = {source: 0}
        # parent[node] = (parent node id, relation id on the connecting edge)
        parent = {source: (-1, -1)}
        q = queue.Queue()
        q.put(source)
        while not q.empty():
            node = q.get()
            # Nodes at the requested depth need no expansion: anything found
            # through them is farther away and never reported. The original
            # kept walking the whole reachable graph and stopped recording
            # parents at a hard-coded depth of 2.
            if distance[node] >= nbd_size or node not in graph:
                continue
            for target, rel in graph[node].items():
                if target in distance:
                    continue
                distance[target] = distance[node] + 1
                parent[target] = (node, rel)
                q.put(target)

        neighbors = {}
        for target, hops in distance.items():
            if hops != nbd_size:
                continue
            relations = []
            entities = [target]
            temp = target
            while parent[temp] != (-1, -1):
                relations.append(parent[temp][1])
                entities.append(parent[temp][0])
                temp = parent[temp][0]
            # entities[:-1] drops the already known source.
            neighbors.setdefault(hops, []).append(
                (tuple(relations), tuple(entities[:-1])))
        return neighbors

    def get_neighbors(self, nbd_size=2):
        """Getting the relation and entity of the source in the n_hop neighborhood.

        Builds ``self.graph`` from the training triples and stores the BFS
        result of every head entity in ``self.neighbours``. Heads without
        any ``nbd_size``-hop neighbour get no entry.

        Args:
            nbd_size: The number of hops.
        """
        self.graph = {}
        for triple in self.train_triples:
            head, rela, tail = triple[0], triple[1], triple[2]
            # A later triple with the same (head, tail) overwrites the relation.
            self.graph.setdefault(head, {})[tail] = rela

        neighbors = {}
        start_time = time.time()
        print("Start Graph BFS")
        for head in self.graph:
            temp_neighbors = self.bfs(self.graph, head, nbd_size)
            # Each head is visited once, so the result is stored as is.
            if temp_neighbors:
                neighbors[head] = temp_neighbors
        print("Finish BFS, time taken ", time.time() - start_time)
        self.neighbours = neighbors

    def get_unique_entity(self, triples):
        """Getting the set of entity.

        Args:
            triples: The sampled triples.

        Returns:
            numpy.array: Sorted unique ids from the head and tail columns.
        """
        train_triples = np.array(triples)
        train_entities = np.concatenate((train_triples[:, 0], train_triples[:, 2]))
        return np.unique(train_entities)

    def get_batch_nhop_neighbors_all(self, nbd_size=2):
        """Getting n_hop neighbors of all entities in batch.

        At most two paths are kept per source entity.

        Args:
            nbd_size: The number of hops.

        Returns:
            LongTensor with one row ``[source, relation next to the source,
            relation next to the target, target]`` per kept path.
        """
        batch_source_triples = []
        for source in self.entity:
            if source not in self.neighbours:
                continue
            # .get avoids a KeyError when nbd_size differs from the depth
            # used by get_neighbors.
            for tup in self.neighbours[source].get(nbd_size, [])[:2]:
                batch_source_triples.append([source,
                                             tup[0][-1],
                                             tup[0][0],
                                             tup[1][0]])
        n_hop = np.array(batch_source_triples).astype(np.int32)
        return torch.LongTensor(n_hop)

    def sampling_negative(self, mode, pos_triples, num_neg):
        """Random negative sampling through ``head_batch`` / ``tail_batch``.

        Args:
            mode: 'head' or 'tail'; which column is replaced.
            pos_triples: The positive triples.
            num_neg: The number of negative samples per triple.

        Returns:
            neg_samples: ``pos_triples`` tiled ``num_neg`` times with the
            chosen column replaced.
        """
        neg_samples = np.tile(pos_triples, (num_neg, 1))
        # Transposing to (num_neg, batch) before flattening matches the row
        # order produced by np.tile above.
        if mode == 'head':
            neg_head = [self.head_batch(h, r, t, num_neg) for h, r, t in pos_triples]
            neg_samples[:, 0] = np.asarray(neg_head).T.reshape(-1)
        elif mode == 'tail':
            neg_tail = [self.tail_batch(h, r, t, num_neg) for h, r, t in pos_triples]
            neg_samples[:, 2] = np.asarray(neg_tail).T.reshape(-1)
        return neg_samples

    def sam_negative(self, mode, pos_triples, num_neg):
        """Random negative sampling without filter.

        Args:
            mode: 'head' or 'tail'; which column is replaced.
            pos_triples: The positive triples.
            num_neg: The number of negative samples per triple.

        Returns:
            neg_samples: ``pos_triples`` tiled ``num_neg`` times with the
            chosen column replaced by random values below
            ``len(self.entity)``.
        """
        # NOTE(review): values are drawn from range(len(self.entity)), i.e.
        # the number of unique batch entities, not from the entity ids
        # themselves -- confirm this is intended.
        neg_random = np.random.choice(
            len(self.entity),
            size=num_neg * len(pos_triples)
        )
        neg_samples = np.tile(pos_triples, (num_neg, 1))
        if mode == 'head':
            neg_samples[:, 0] = neg_random
        elif mode == 'tail':
            neg_samples[:, 2] = neg_random
        return neg_samples
class CompGCNSampler(GraphSampler):
    """1-to-all sampler that ships a fixed, pre-built graph with each batch.

    Attributes:
        relation: The relations of the graph edges.
        triples: The (head, relation) pairs of the most recent batch.
        graph: The DGL graph built once from ``self.t_triples``.
        norm: The edge norm in graph.
        label: Multi-hot tail labels of the most recent batch.
    """

    def __init__(self, args):
        super().__init__(args)
        self.relation = None
        self.triples = None
        self.graph = None
        self.norm = None
        self.label = None
        # Parent helper (spelled this way upstream); presumably fills
        # self.t_triples -- TODO confirm.
        super().get_hr_trian()
        edge_columns = np.array(self.t_triples).transpose()
        self.graph, self.relation, self.norm = self.build_graph(
            self.args.num_ent, edge_columns, -0.5)

    def sampling(self, pos_hr_t):
        """Build the batch for a list of ``((h, r), tails)`` items.

        Args:
            pos_hr_t: Sequence of ``(hr, tails)`` pairs.

        Returns:
            batch_data: Dict with ``sample``, ``label``, ``graph``,
            ``relation`` and ``norm``.
        """
        self.label = torch.zeros(self.args.train_bs, self.args.num_ent)
        hr_pairs = []
        for row, (hr, tails) in enumerate(pos_hr_t):
            hr_pairs.append(hr)
            self.label[row][tails] = 1
        self.triples = torch.LongTensor(hr_pairs)
        return {
            'sample': self.triples,
            'label': self.label,
            'graph': self.graph,
            'relation': self.relation,
            'norm': self.norm,
        }

    def get_sampling_keys(self):
        """Return the batch keys produced by ``sampling``."""
        return ['sample','label','graph','relation','norm']

    def node_norm_to_edge_norm(self, graph, node_norm):
        """Calculating the normalization edge weight.

        Unlike the parent implementation, this writes 'norm' into the given
        graph's node data directly and multiplies source and destination
        norms.

        Args:
            graph: The DGL graph.
            node_norm: The node weight of normalization.

        Returns:
            norm: Per edge, ``dst_norm * src_norm`` (squeezed).
        """
        graph.ndata['norm'] = node_norm

        def _edge_norm(edges):
            return {'norm': edges.dst['norm'] * edges.src['norm']}

        graph.apply_edges(_edge_norm)
        # pop removes the temporary edge feature again.
        return graph.edata.pop('norm').squeeze()
class TestSampler(object):
    """Sampling triples and recording positive triples for testing.

    Attributes:
        sampler: The training sampler this evaluator is built on.
        hr2t_all: (head, relation) -> tensor of all known tails.
        rt2h_all: (relation, tail) -> tensor of all known heads.
        num_ent: The count of entities.
    """

    def __init__(self, sampler):
        self.sampler = sampler
        self.hr2t_all = ddict(set)
        self.rt2h_all = ddict(set)
        self.get_hr2t_rt2h_from_all()
        self.num_ent = sampler.args.num_ent

    def get_hr2t_rt2h_from_all(self):
        """Index every triple from ``sampler.get_all_true_triples()``.

        Update:
            self.hr2t_all: The set of hr2t, values converted to tensors.
            self.rt2h_all: The set of rt2h, values converted to tensors.
        """
        self.all_true_triples = self.sampler.get_all_true_triples()
        for h, r, t in self.all_true_triples:
            self.hr2t_all[(h, r)].add(t)
            self.rt2h_all[(r, t)].add(h)
        # Values are replaced in place; the key sets stay unchanged.
        for lookup in (self.hr2t_all, self.rt2h_all):
            for key in lookup:
                lookup[key] = torch.tensor(list(lookup[key]))

    def sampling(self, data):
        """Sampling triples and recording positive triples for testing.

        Args:
            data: Sequence of ``(head, rel, tail)`` triples.

        Returns:
            batch_data: Dict with ``positive_sample``, ``head_label`` and
            ``tail_label`` (1.0 at every known true head / tail).
        """
        n_rows = len(data)
        head_label = torch.zeros(n_rows, self.num_ent)
        tail_label = torch.zeros(n_rows, self.num_ent)
        for row, (head, rel, tail) in enumerate(data):
            head_label[row][self.rt2h_all[(rel, tail)]] = 1.0
            tail_label[row][self.hr2t_all[(head, rel)]] = 1.0
        return {
            "positive_sample": torch.tensor(data),
            "head_label": head_label,
            "tail_label": tail_label,
        }

    def get_sampling_keys(self):
        """Return the batch keys produced by ``sampling``."""
        return ["positive_sample", "head_label", "tail_label"]
class GraphTestSampler(object):
    """Sampling graph for testing.

    Attributes:
        sampler: The training sampler this evaluator is built on.
        hr2t_all: (head, relation) -> tensor of all known tails.
        rt2h_all: (relation, tail) -> tensor of all known heads.
        num_ent: The count of entities.
        triples: The training triples (``sampler.train_triples``).
    """

    def __init__(self, sampler):
        self.sampler = sampler
        self.hr2t_all = ddict(set)
        self.rt2h_all = ddict(set)
        self.get_hr2t_rt2h_from_all()
        self.num_ent = sampler.args.num_ent
        self.triples = sampler.train_triples

    def get_hr2t_rt2h_from_all(self):
        """Index every triple from ``sampler.get_all_true_triples()``.

        Update:
            self.hr2t_all: The set of hr2t, values converted to tensors.
            self.rt2h_all: The set of rt2h, values converted to tensors.
        """
        self.all_true_triples = self.sampler.get_all_true_triples()
        for h, r, t in self.all_true_triples:
            self.hr2t_all[(h, r)].add(t)
            self.rt2h_all[(r, t)].add(h)
        # Values are replaced in place; the key sets stay unchanged.
        for lookup in (self.hr2t_all, self.rt2h_all):
            for key in lookup:
                lookup[key] = torch.tensor(list(lookup[key]))

    def sampling(self, data):
        """Sampling graph for testing.

        The graph is rebuilt from all training triples on every call via
        ``sampler.build_graph`` with power -1.

        Args:
            data: Sequence of ``(head, rel, tail)`` triples.

        Returns:
            batch_data: Dict with ``positive_sample``, ``head_label``,
            ``tail_label``, ``graph``, ``rela``, ``norm`` and ``entity``.
        """
        n_rows = len(data)
        head_label = torch.zeros(n_rows, self.num_ent)
        tail_label = torch.zeros(n_rows, self.num_ent)
        for row, (head, rel, tail) in enumerate(data):
            head_label[row][self.rt2h_all[(rel, tail)]] = 1.0
            tail_label[row][self.hr2t_all[(head, rel)]] = 1.0
        positive_sample = torch.tensor(data)

        heads, relations, tails = np.array(self.triples).transpose()
        graph, rela, norm = self.sampler.build_graph(
            self.num_ent, (heads, relations, tails), -1)
        all_entities = torch.arange(0, self.num_ent, dtype=torch.long).view(-1, 1)
        return {
            "positive_sample": positive_sample,
            "head_label": head_label,
            "tail_label": tail_label,
            "graph": graph,
            "rela": rela,
            "norm": norm,
            "entity": all_entities,
        }

    def get_sampling_keys(self):
        """Return the batch keys produced by ``sampling``."""
        return ["positive_sample", "head_label", "tail_label",
                "graph", "rela", "norm", "entity"]
class CompGCNTestSampler(object):
    """Sampling graph for testing.

    Attributes:
        sampler: The training sampler this evaluator is built on.
        hr2t_all: (head, relation) -> tensor of all known tails.
        rt2h_all: (relation, tail) -> tensor of all known heads.
        num_ent: The count of entities.
        triples: The triples used for the graph (``sampler.t_triples``).
    """

    def __init__(self, sampler):
        self.sampler = sampler
        self.hr2t_all = ddict(set)
        self.rt2h_all = ddict(set)
        self.get_hr2t_rt2h_from_all()
        self.num_ent = sampler.args.num_ent
        self.triples = sampler.t_triples

    def get_hr2t_rt2h_from_all(self):
        """Index every triple from ``sampler.get_all_true_triples()``.

        Update:
            self.hr2t_all: The set of hr2t, values converted to tensors.
            self.rt2h_all: The set of rt2h, values converted to tensors.
        """
        self.all_true_triples = self.sampler.get_all_true_triples()
        for h, r, t in self.all_true_triples:
            self.hr2t_all[(h, r)].add(t)
            self.rt2h_all[(r, t)].add(h)
        # Values are replaced in place; the key sets stay unchanged.
        for lookup in (self.hr2t_all, self.rt2h_all):
            for key in lookup:
                lookup[key] = torch.tensor(list(lookup[key]))

    def sampling(self, data):
        """Sampling graph for testing.

        The graph is rebuilt from ``self.triples`` on every call via
        ``sampler.build_graph`` with power -0.5.

        Args:
            data: Sequence of ``(head, rel, tail)`` triples.

        Returns:
            batch_data: Dict with ``positive_sample``, ``head_label``,
            ``tail_label``, ``graph``, ``rela``, ``norm`` and ``entity``.
        """
        n_rows = len(data)
        head_label = torch.zeros(n_rows, self.num_ent)
        tail_label = torch.zeros(n_rows, self.num_ent)
        for row, (head, rel, tail) in enumerate(data):
            head_label[row][self.rt2h_all[(rel, tail)]] = 1.0
            tail_label[row][self.hr2t_all[(head, rel)]] = 1.0
        positive_sample = torch.tensor(data)

        edge_columns = np.array(self.triples).transpose()
        graph, relation, norm = self.sampler.build_graph(
            self.num_ent, edge_columns, -0.5)
        all_entities = torch.arange(0, self.num_ent, dtype=torch.long).view(-1, 1)
        return {
            "positive_sample": positive_sample,
            "head_label": head_label,
            "tail_label": tail_label,
            "graph": graph,
            "rela": relation,
            "norm": norm,
            "entity": all_entities,
        }

    def get_sampling_keys(self):
        """Return the batch keys produced by ``sampling``."""
        return ["positive_sample", "head_label", "tail_label",
                "graph", "rela", "norm", "entity"]
class KGDataset(Dataset):
    """Thin ``torch.utils.data.Dataset`` wrapper around a triple collection.

    Inherits from torch's ``Dataset``; length and indexing are delegated to
    the wrapped ``triples`` object.
    """

    def __init__(self, triples):
        self.triples = triples

    def __len__(self):
        """Return the number of stored triples."""
        return len(self.triples)

    def __getitem__(self, idx):
        """Return the triple stored at position ``idx``."""
        return self.triples[idx]