# Source code for languagechange.models.change.widid
from languagechange.models.meaning.clustering import Clustering, APosterioriaffinityPropagation
from languagechange.models.change.timeseries import TimeSeries
import numpy as np
from typing import List, Union
[docs]
class WiDiD:
    """
    A class that implements WiDiD (https://github.com/FrancescoPeriti/WiDiD).

    The embeddings of a target word from several time periods are clustered
    together, each cluster is summarised per time period by its mean
    ("prototype") embedding, and the change between the prototypes of
    different periods is computed as a TimeSeries.
    """

    def __init__(self, algorithm = APosterioriaffinityPropagation, metric = 'cosine', **args):
        """
        Args:
            algorithm: the clustering algorithm class (not an instance). It is
                called as ``algorithm(**args)`` and the result is wrapped in
                ``Clustering``.
            metric (str): stored as ``self.metric``; it is not read by the
                methods shown here.
            **args: keyword arguments forwarded to ``algorithm``.
        """
        self.clustering_parameters = args
        self.algorithm = algorithm
        self.clustering = Clustering(self.algorithm(**self.clustering_parameters))
        self.metric = metric

    def compute_scores(self, embs_list: List[np.ndarray], timeseries_type='consecutive', k=1, change_metric='apd', time_labels: Union[np.ndarray, List, None] = None):
        """
        Clusters the embeddings with the configured algorithm (a-posteriori affinity propagation, APP, by default) and computes the semantic change as the APD (or another metric) between the prototype embeddings in clusters of different time periods.

        Args:
            embs_list ([np.ndarray]): a list of embeddings for a target word, where each element is embeddings of one time period (one embedding per row).
            timeseries_type (str): the type of timeseries (see usage in languagechange.models.change.timeseries).
            k (int): the window size, if moving average (see usage in languagechange.models.change.timeseries).
            change_metric (str): the change metric (e.g. 'apd') to use (see usage in languagechange.models.change.timeseries).
            time_labels (np.ndarray|list|None): labels for the x axis of the timeseries (see usage in languagechange.models.change.timeseries).

        Returns:
            labels ([np.ndarray]): the labels for each embedding in each time period.
            prot_embs ([np.ndarray]): a list of matrices encoding the prototype (average) embedding of each cluster in each time period. Rows follow the sorted order of the cluster labels present in that period.
            change_scores (TimeSeries): a timeseries (languagechange.models.change.timeseries.TimeSeries) containing the degree of change between the embeddings in different time periods.
        """
        # A new Clustering object is built on every call, so a call never
        # reuses the object left on the instance by __init__ or a previous call.
        self.clustering = Clustering(self.algorithm(**self.clustering_parameters))
        self.clustering.get_cluster_results(embs_list)

        # Coerce to an ndarray (no copy if it already is one): the element-wise
        # comparison used for masking below requires an array, not a list.
        all_labels = np.asarray(self.clustering.labels)

        # The labels come back as one flat sequence covering all periods in
        # order; cut it back into one chunk per time period.
        labels = []
        start = 0
        for embs in embs_list:
            stop = start + embs.shape[0]
            labels.append(all_labels[start:stop])
            start = stop

        # Compute the centroids of each cluster (the prototype embeddings)
        prot_embs = [
            np.array([embs[period_labels == label].mean(axis=0) for label in np.unique(period_labels)])
            for embs, period_labels in zip(embs_list, labels)
        ]

        # Get the change scores between prototype embeddings
        change_scores = TimeSeries(embs=prot_embs, change_metric=change_metric, timeseries_type=timeseries_type, k=k, time_labels=time_labels)
        return labels, prot_embs, change_scores