Source code for languagechange.models.representation.prompting

from typing import List, Union
from languagechange.usages import TargetUsage
import getpass
import os
from langchain_core.prompts import ChatPromptTemplate
from langchain.chat_models import init_chat_model
from pydantic import BaseModel, Field
import logging
import trankit


# Structured-output schema: a single continuous semantic change score.
# Documented with comments rather than a docstring so that the schema
# generated from this model is left exactly as it was.
class SCFloat(BaseModel):
    # Score constrained to the closed interval [0, 1].
    change: float = Field(
        ge=0,
        le=1,
        description='The semantic change on a scale from 0 to 1.',
    )
# Structured-output schema: an integer relatedness judgement on a 1-4 scale.
# Documented with comments rather than a docstring so that the only change to
# the schema generated from this model is the corrected field description.
class SCDURel(BaseModel):
    # Integer constrained to 1..4; the meaning of each level is spelled out in
    # the description, which is part of the schema given to the chat model.
    change: int = Field(
        description='The semantic similarity from 1 to 4, where 1 is unrelated, 2 is distantly related, 3 is closely related and 4 is identical.',
        le=4,
        ge=1,
    )
class PromptModel:
    """Chat model wrapper that rates the semantic change between two usages.

    On construction the required API keys are placed in the process
    environment (prompting interactively when they are missing), a LangChain
    chat model is created for the requested provider and, unless no output
    structure is selected, the model is wrapped with
    ``with_structured_output``.
    """

    # Environment variable holding the API key of each known provider.
    _PROVIDER_KEY_NAMES = {
        "openai": "OPENAI_API_KEY",
        "anthropic": "ANTHROPIC_API_KEY",
        "azure": "AZURE_OPENAI_API_KEY",
        "groq": "GROQ_API_KEY",
        "cohere": "COHERE_API_KEY",
        "nvidia": "NVIDIA_API_KEY",
        "fireworks": "FIREWORKS_API_KEY",
        "mistralai": "MISTRAL_API_KEY",
        "together": "TOGETHER_API_KEY",
        "ibm": "WATSONX_APIKEY",
        "databricks": "DATABRICKS_TOKEN",
        "xai": "XAI_API_KEY",
    }

    def __init__(
        self,
        model_name: str,
        model_provider: str,
        langsmith_key: str = None,
        provider_key_name: str = None,
        provider_key: str = None,
        structure: Union[str, "BaseModel"] = "float",
        language: str = None,
        **kwargs,
    ):
        """
        Set up API keys, the chat model and the output structure.

        Args:
            model_name (str): name of the model (used as the endpoint name
                for Databricks and as the model id for IBM watsonx).
            model_provider (str): provider identifier, e.g. 'openai'.
            langsmith_key (str): LangSmith API key. If omitted, the
                LANGSMITH_API_KEY environment variable is used, and if that
                is unset the key is requested interactively.
            provider_key_name (str): name of the environment variable that
                holds the provider API key. Looked up from the known
                providers when omitted.
            provider_key (str): the provider API key. If omitted, the
                environment variable is used, and if that is unset the key
                is requested interactively.
            structure (str or BaseModel subclass): 'float' for SCFloat,
                'DURel' for SCDURel, or a custom pydantic model class with a
                field named 'change'. Any other value gives unstructured
                output.
            language (str): language passed to trankit when lemmatizing.
            **kwargs: provider specific settings: 'url' and 'project_id' for
                'ibm', 'databricks_host_url' for 'databricks'.

        Raises:
            ValueError: if a provider key is given but no environment
                variable name can be determined, if required provider
                settings are missing, or if a custom structure has no
                'change' field.
            RuntimeError: if the chat model could not be initialized.
        """
        self.model_name = model_name
        self.language = language
        # The trankit pipeline is created on first use and then reused.
        self._lemmatizer = None
        self._lemmatizer_language = None

        self._configure_api_keys(model_provider, langsmith_key, provider_key_name, provider_key)
        llm = self._build_llm(model_name, model_provider, kwargs)

        self.structure = self._resolve_structure(structure)
        if self.structure is not None:
            self.model = llm.with_structured_output(self.structure)
        else:
            self.model = llm

    @classmethod
    def _configure_api_keys(cls, model_provider, langsmith_key, provider_key_name, provider_key):
        """Store the LangSmith and provider API keys in the environment."""
        os.environ["LANGSMITH_TRACING"] = "true"

        # The keys can either be passed as arguments, stored as an
        # environment variable or put in manually.
        if langsmith_key is not None:
            os.environ["LANGSMITH_API_KEY"] = langsmith_key
        elif not os.environ.get("LANGSMITH_API_KEY"):
            os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter API key for LangSmith: ")

        if provider_key_name is None:
            provider_key_name = cls._PROVIDER_KEY_NAMES.get(model_provider)

        if provider_key is not None:
            if provider_key_name is None:
                # Without a variable name there is nowhere to store the key;
                # os.environ[None] would otherwise fail with a TypeError.
                message = (
                    f"No API key environment variable is known for provider "
                    f"'{model_provider}'. Pass 'provider_key_name' together with 'provider_key'."
                )
                logging.error(message)
                raise ValueError(message)
            os.environ[provider_key_name] = provider_key
        elif provider_key_name is not None and not os.environ.get(provider_key_name):
            os.environ[provider_key_name] = getpass.getpass(f"Enter API key for {model_provider}: ")

    @staticmethod
    def _build_llm(model_name, model_provider, options):
        """Create the chat model for the given provider."""
        # Special cases that are constructed directly rather than through
        # init_chat_model. Their packages are imported only when needed.
        if model_provider == "azure":
            # pip install -qU "langchain[openai]"
            from langchain_openai import AzureChatOpenAI
            return AzureChatOpenAI(
                azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
                azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
                openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
            )

        if model_provider == "ibm":
            if 'url' not in options or 'project_id' not in options:
                raise ValueError("Pass 'url' and 'project_id' to initialize a ChatWatsonx model.")
            # pip install -qU "langchain-ibm"
            from langchain_ibm import ChatWatsonx
            return ChatWatsonx(
                model_id=model_name,
                url=options['url'],
                project_id=options['project_id'],
            )

        if model_provider == "databricks":
            if 'databricks_host_url' not in options:
                raise ValueError("Pass 'databricks_host_url' to initialize a Databricks model.")
            os.environ["DATABRICKS_HOST"] = options['databricks_host_url']
            # pip install -qU "databricks-langchain"
            from databricks_langchain import ChatDatabricks
            return ChatDatabricks(endpoint=model_name)

        try:
            return init_chat_model(model_name, model_provider=model_provider)
        except Exception as err:
            logging.error("Could not initialize chat model.")
            # Chain the cause so the underlying provider error stays visible.
            raise RuntimeError("Could not initialize chat model.") from err

    @staticmethod
    def _resolve_structure(structure):
        """Map the 'structure' argument to a pydantic model class or None."""
        # The isinstance(type) guard keeps issubclass from raising a
        # TypeError for values that are not classes (e.g. None).
        if isinstance(structure, type) and issubclass(structure, BaseModel):
            if 'change' not in structure.model_fields:
                message = "A custom BaseModel needs to have a field named 'change'."
                logging.error(message)
                raise ValueError(message)
            return structure
        if structure == "float":
            return SCFloat
        if structure == "DURel":
            return SCDURel
        return None

    def _get_lemmatizer(self):
        """Return a trankit pipeline for self.language, creating it once."""
        pipeline = getattr(self, "_lemmatizer", None)
        cached_language = getattr(self, "_lemmatizer_language", None)
        if pipeline is None or cached_language != self.language:
            pipeline = trankit.Pipeline(self.language)
            self._lemmatizer = pipeline
            self._lemmatizer_language = self.language
        return pipeline

    @staticmethod
    def _find_lemma(tokenized, usage):
        """Return the lemma of the token whose span equals the usage offsets, or None."""
        span = tuple(usage.offsets)
        for token in tokenized['tokens']:
            if tuple(token['span']) == span:
                return token.get('lemma')
        return None

    def get_response(
        self,
        target_usages: List["TargetUsage"],
        system_message='You are a lexicographer',
        user_prompt_template=(
            "Please provide a number measuring how different the meaning of the "
            "word '{target}' is between the following example sentences: "
            "\n1. {usage_1}\n2. {usage_2}"
        ),
        lemmatize=True,
    ):
        """
        Takes as input two target usages and returns the degree of semantic
        change between them, as judged by the chat model.

        Args:
            target_usages (List[TargetUsage]): exactly two target usages with
                the same target word.
            system_message (str): the system message to use in the prompt.
            user_prompt_template (str): template to use for the user message
                in the prompt. It is filled with 'target', 'usage_1' and
                'usage_2'.
            lemmatize (bool): whether the target word should be lemmatized in
                the prompt or not. Uses trankit to lemmatize. If no token
                matches the offsets of the first usage, the word as it
                appears in the first usage is used instead.

        Returns:
            The 'change' attribute of the model response if it has one,
            otherwise the response object as returned by the model.

        Raises:
            ValueError: if not exactly two usages are passed, or if
                lemmatize is set but the model has no language.
            RuntimeError: if the chat completion fails.
        """
        if len(target_usages) != 2:
            raise ValueError(
                f"get_response expects exactly two target usages, got {len(target_usages)}."
            )

        sentences = [usage.text() for usage in target_usages]
        words = [
            sentence[usage.offsets[0]:usage.offsets[1]]
            for usage, sentence in zip(target_usages, sentences)
        ]

        # Default to the surface form of the first usage.
        target = words[0]

        if lemmatize:
            if self.language is None:
                message = (
                    "Could not lemmatize using trankit because no language is set. "
                    "Please pass a value to 'language' when initializing the model."
                )
                logging.error(message)
                raise ValueError(message)

            pipeline = self._get_lemmatizer()
            lemmas = [
                self._find_lemma(pipeline.lemmatize(sentence, is_sent=True), usage)
                for usage, sentence in zip(target_usages, sentences)
            ]
            if lemmas[0] != lemmas[1]:
                logging.info("Lemmas of the two target words differ, are you sure they are different forms of the same lexeme?")
            if lemmas[0] is not None:
                target = lemmas[0]
            else:
                # Avoid putting the literal text 'None' into the prompt.
                logging.warning(
                    "No token matches the offsets of the first usage; using the word form '%s' instead of a lemma.",
                    words[0],
                )

        prompt_template = ChatPromptTemplate.from_messages(
            [("system", system_message), ("user", user_prompt_template)]
        )
        prompt = prompt_template.invoke(
            {"target": target, "usage_1": sentences[0], "usage_2": sentences[1]}
        )

        try:
            response = self.model.invoke(prompt)
        except Exception as err:
            logging.error("Could not run chat completion.")
            raise RuntimeError("Could not run chat completion.") from err

        # Structured responses carry the score in 'change'; anything else is
        # handed back unchanged.
        return getattr(response, "change", response)