# Source code for languagechange.usages
"""Target usage helpers and containers for LanguageChange."""
import enum
import pickle
import logging
import os
import re
import jsonlines
from pathlib import Path
from languagechange.utils import Time
class POS(enum.Enum):
    """Parts of speech that can be attached to a target word.

    Each member carries an explicitly assigned integer value (1 through 4).
    """

    # Values are spelled out by hand rather than generated.
    NOUN = 1
    VERB = 2
    ADJECTIVE = 3
    ADVERB = 4
class Target:
    """A target word together with optional lemma / part-of-speech metadata.

    The ``lemma`` and ``pos`` attributes are only created once
    :meth:`set_lemma` / :meth:`set_pos` have been called.

    Equality and hashing are both derived from the target string only, so
    two ``Target`` objects wrapping the same string are interchangeable as
    dictionary keys and set members.
    """

    def __init__(self, target: str):
        """Store the surface form of the target word.

        Args:
            target: The target word.
        """
        self.target = target

    def set_lemma(self, lemma: str):
        """Attach a lemma to this target.

        Args:
            lemma: Lemma to store on the ``lemma`` attribute.
        """
        self.lemma = lemma

    def set_pos(self, pos: "POS"):
        """Attach a part of speech to this target.

        Args:
            pos: Part of speech to store on the ``pos`` attribute.
        """
        self.pos = pos

    def __str__(self):
        return self.target

    def __eq__(self, other):
        # Kept in step with __hash__: without this, objects with identical
        # hashes would still compare by identity and dict lookups with an
        # equivalent Target would miss.
        if not isinstance(other, Target):
            return NotImplemented
        return self.target == other.target

    def __hash__(self):
        return hash(self.target)
class TargetUsage:
    """A single usage: a text, the offsets of the target in it, and a time.

    Any extra keyword arguments given to the constructor are stored as
    instance attributes under the same names.
    """

    def __init__(self, text: str, offsets: str, time: "Time" = None, **kwargs):
        """Create a usage.

        Args:
            text: The usage text; stored on ``text_``.
            offsets: Indexable value whose items 0 and 1 are returned by
                :meth:`start` and :meth:`end`.
                NOTE(review): annotated as ``str`` but used like a
                (start, end) pair - confirm the intended type.
            time: Optional time metadata for the usage.
            **kwargs: Additional attributes to set on the instance.
        """
        self.text_ = text
        self.offsets = offsets
        self.time = time
        # Extra metadata becomes plain instance attributes.
        self.__dict__.update(kwargs)

    def text(self):
        """Return the usage text."""
        return self.text_

    def start(self):
        """Return the first element of ``offsets``."""
        return self.offsets[0]

    def end(self):
        """Return the second element of ``offsets``."""
        return self.offsets[1]

    def time(self):
        """Return the stored time.

        Note: ``__init__`` also stores an instance attribute named ``time``,
        and instance attributes take precedence over methods on lookup.
        ``usage.time`` therefore yields the stored value directly; this
        function is only reachable through the class
        (``TargetUsage.time(usage)``).
        """
        return self.time

    def to_dict(self):
        """Return the instance attributes as a new dict, with ``time`` as a string.

        A shallow copy of ``__dict__`` is used so that stringifying ``time``
        (and any later edits the caller makes to the result) do not alter
        the usage object itself.
        """
        d = dict(self.__dict__)
        d['time'] = str(d['time'])
        return d

    def __getitem__(self, item):
        # Indexing / slicing is delegated to the usage text.
        return self.text_[item]

    def __str__(self):
        return self.text_
class DWUGUsage(TargetUsage):
    """A ``TargetUsage`` extended with DWUG-specific fields.

    On top of what ``TargetUsage`` stores, an instance keeps ``target``,
    ``date``, ``grouping``, ``identifier`` and ``description``.
    """

    def __init__(self, target, date, grouping, identifier, description, **args):
        """Initialise the base usage, then record the DWUG fields.

        Args:
            target: Stored on ``self.target``.
            date: Stored on ``self.date``.
            grouping: Stored on ``self.grouping``.
            identifier: Stored on ``self.identifier``.
            description: Stored on ``self.description``.
            **args: Forwarded unchanged to ``TargetUsage.__init__``.
        """
        # The base initialiser runs first, so its attributes are set before
        # the DWUG-specific ones.
        super().__init__(**args)

        self.target = target
        self.date = date
        self.grouping = grouping
        self.identifier = identifier
        self.description = description
class TargetUsageList(list):
    """A list of usage objects with pickle persistence and bulk helpers."""

    def save(self, path, target):
        """Pickle this list to the file ``target`` inside directory ``path``.

        Args:
            path: Output directory; created (with parents) if missing.
            target: File name to write inside ``path``.
        """
        Path(path).mkdir(parents=True, exist_ok=True)
        with open(os.path.join(path, target), 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load(path, target):
        """Unpickle and return the object stored at ``path``/``target``.

        Declared as a static method so it can be called both on the class
        (``TargetUsageList.load(path, target)``) and on an instance.

        Args:
            path: Directory containing the pickle file.
            target: File name inside ``path``.

        Returns:
            Whatever object was pickled in the file.
        """
        # SECURITY: unpickling can execute arbitrary code. Only load files
        # that come from a trusted source.
        with open(os.path.join(path, target), 'rb') as f:
            return pickle.load(f)

    def time_axis(self):
        """Return the ``time`` attribute of every element, in list order."""
        return [usage.time for usage in self]

    def to_dict(self):
        """Return ``to_dict()`` of every element, in list order."""
        return [tu.to_dict() for tu in self]
class UsageDictionary(dict):
    """Dictionary from words to their usage lists, persisted as JSON Lines.

    Each word is stored in its own ``<word>_usages.jsonl`` file.
    """

    def save(self, path, words=None):
        """Write the usages of the selected words to ``path``.

        Args:
            path: Output directory; created (with parents) if missing.
            words: Iterable of words to save. ``None`` or any empty
                collection means "every word in the dictionary", matching
                how :meth:`load` interprets an empty selection.
        """
        Path(path).mkdir(parents=True, exist_ok=True)
        if not words:
            selected = set(self)
        else:
            requested = set(words)
            words_not_present = requested.difference(self)
            if words_not_present:
                logging.info(f'Words {words_not_present} are not in the usage dictionary')
            selected = requested.intersection(self)

        for k in selected:
            output_fn = os.path.join(path, f"{k}_usages.jsonl")
            rows = []
            for tu in self[k].to_dict():
                # Build a new dict with 'text' first, then drop 'text_',
                # so the file uses the 'text' key instead of 'text_'.
                row = {'text': tu['text_']} | tu
                row.pop('text_')
                rows.append(row)
            with jsonlines.open(output_fn, 'w') as writer:
                writer.write_all(rows)
            logging.info(f"Usages written to {output_fn}")

    def load(self, path, words=None):
        """Replace the dictionary contents with usages read from ``path``.

        Only files whose whole name has the form ``<word>_usages.jsonl`` are
        considered; every JSON object in a file is passed as keyword
        arguments to ``TargetUsage``.

        Args:
            path: Directory to read from. If it does not exist, an error is
                logged and the dictionary is left untouched.
            words: Iterable of words to load. ``None`` or any empty
                collection means "every word found in ``path``".

        Returns:
            None.
        """
        if not os.path.exists(path):
            logging.error(f'Path {path} does not exist.')
            return None

        self.clear()
        wanted = set(words) if words else set()

        for fn in os.listdir(path):
            # fullmatch rejects names with trailing junk such as
            # 'x_usages.jsonl.bak'.
            match = re.fullmatch(r'(.*)_usages\.jsonl', fn)
            if match is None:
                continue
            key = match.group(1)
            if wanted and key not in wanted:
                continue
            full_path = os.path.join(path, fn)
            with jsonlines.open(full_path, 'r') as reader:
                self[key] = TargetUsageList(TargetUsage(**tu) for tu in reader)
            logging.info(f"Loaded usages from {full_path}")

        not_loaded_words = wanted.difference(self)
        if not_loaded_words:
            logging.info(f"Could not find usages for words {not_loaded_words}")