# Source code for languagechange.models.change.widid

from languagechange.models.meaning.clustering import Clustering, APosterioriaffinityPropagation
from languagechange.models.change.timeseries import TimeSeries
import numpy as np
from typing import List, Union


# [docs]
class WiDiD:
    """
        A class that implements WiDiD (https://github.com/FrancescoPeriti/WiDiD).

        The embeddings of a target word from several time periods are clustered, each
        cluster is summarised per time period by its prototype (mean) embedding, and the
        change between time periods is measured on those prototypes.

        Attributes:
            algorithm: the clustering algorithm class, instantiated with clustering_parameters.
            clustering_parameters (dict): the keyword arguments passed on to the algorithm.
            clustering (Clustering): the wrapper around the most recently built algorithm instance.
            metric (str): the metric name given at construction. It is stored only; no method of this class reads it.
    """
    def __init__(self, algorithm=APosterioriaffinityPropagation, metric='cosine', **args):
        """
            Args:
                algorithm: the clustering algorithm class (a-posteriori affinity propagation by default).
                metric (str): a metric name, stored on the instance as self.metric.
                **args: keyword arguments used to instantiate the clustering algorithm.
        """
        self.clustering_parameters = args
        self.algorithm = algorithm
        self.clustering = Clustering(self.algorithm(**self.clustering_parameters))
        self.metric = metric

    def compute_scores(self, embs_list: List[np.ndarray], timeseries_type='consecutive', k=1, change_metric='apd', time_labels: Union[np.ndarray, List, None] = None):
        """
            Clusters the embeddings with the configured algorithm (a-posteriori affinity propagation by default) and computes the semantic change as the APD (or another metric) between the prototype embeddings in clusters of different time periods.

            Args:
                embs_list ([np.ndarray]): a list of embeddings for a target word, where each element is embeddings of one time period.
                timeseries_type (str): the type of timeseries (see usage in languagechange.models.change.timeseries).
                k (int): the window size, if moving average (see usage in languagechange.models.change.timeseries).
                change_metric (str): the change metric (e.g. 'apd') to use (see usage in languagechange.models.change.timeseries).
                time_labels (np.ndarray|list|None): labels for the x axis of the timeseries (see usage in languagechange.models.change.timeseries).

            Returns:
                labels ([np.ndarray]): the labels for each embedding in each time period.
                prot_embs ([np.ndarray]): a list of matrices encoding the prototype (average) embedding of each cluster in each time period, with one row per cluster label present in that period, in sorted label order.
                change_scores (TimeSeries): a timeseries (languagechange.models.change.timeseries.TimeSeries) containing the degree of change between the embeddings in different time periods.
        """
        # The clustering object is rebuilt on every call (presumably so that state
        # from an earlier call is not reused - confirm against Clustering).
        self.clustering = Clustering(self.algorithm(**self.clustering_parameters))
        self.clustering.get_cluster_results(embs_list)

        # Coerce to an array: the boolean masks below need an element-wise
        # comparison, which a plain Python list would not provide.
        all_labels = np.asarray(self.clustering.labels)

        # The labels come back as one flat sequence covering all time periods in
        # order; cut it into one chunk per period, sized by that period's embeddings.
        labels = []
        offset = 0
        for embs in embs_list:
            n_embs = embs.shape[0]
            labels.append(all_labels[offset:offset + n_embs])
            offset += n_embs

        # Compute the centroids of each cluster (the prototype embeddings)
        prot_embs = [
            np.array([embs[period_labels == label].mean(axis=0) for label in np.unique(period_labels)])
            for embs, period_labels in zip(embs_list, labels)
        ]

        # Get the change scores between prototype embeddings
        change_scores = TimeSeries(embs=prot_embs, change_metric=change_metric, timeseries_type=timeseries_type, k=k, time_labels=time_labels)

        return labels, prot_embs, change_scores