arthur_bench.scoring#
- class arthur_bench.scoring.ScoringMethodName(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)#
Bases:
str, Enum
- BERTScore = 'bertscore'#
- ExactMatch = 'exact_match'#
- Hallucination = 'hallucination'#
- HedgingLanguage = 'hedging_language'#
- PythonUnitTesting = 'python_unit_testing'#
- QACorrectness = 'qa_correctness'#
- Readability = 'readability'#
- Specificity = 'specificity'#
- SummaryQuality = 'summary_quality'#
- WordCountMatch = 'word_count_match'#
- arthur_bench.scoring.scorer_from_string(method: str) type[arthur_bench.scoring.scorer.Scorer]#
Submodules#
- class arthur_bench.scoring.bertscore.BERTScore(model_type='microsoft/deberta-v3-base', precision_weight=0.1)#
Bases:
Scorer
Tailored BERTScore implementation.
https://arxiv.org/abs/1904.09675
- static name() str#
Get the name of this Scorer.
- Returns:
the Scorer name
- run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#
Score a batch of candidate generations.
- Parameters:
candidate_batch – candidate generations to score
reference_batch – reference strings representing target outputs
input_text_batch – optional corresponding inputs
context_batch – optional corresponding contexts, if needed by scorer
- Returns:
scoring results for this batch. Float scores are deprecated, use ScoreResult instead
- to_dict(warn=False)#
Provides a json serializable representation of the scorer.
- class arthur_bench.scoring.exact_match.ExactMatch(case_sensitive=True)#
Bases:
Scorer
Returns 1 if the candidate matches the reference, 0 if the candidate does not match the reference.
- static categories() List[Category]#
All possible values returned by the scorer if output type is categorical.
- static is_categorical() bool#
Whether the scorer is continuous or categorical. categories() should be implemented if True
- static name() str#
Get the name of this Scorer.
- Returns:
the Scorer name
- run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#
Score a batch of candidate generations.
- Parameters:
candidate_batch – candidate generations to score
reference_batch – reference strings representing target outputs
input_text_batch – optional corresponding inputs
context_batch – optional corresponding contexts, if needed by scorer
- Returns:
scoring results for this batch. Float scores are deprecated, use ScoreResult instead
- class arthur_bench.scoring.hallucination.Hallucination#
Bases:
Scorer
Score each output against a context using Arthur’s hosted hallucination checker. A score of 1.0 means the hallucination checker estimates the output is supported by the context. A score of 0.0 means the hallucination checker found information in the output unsupported by the context.
- static categories() List[Category]#
All possible values returned by the scorer if output type is categorical.
- static is_categorical() bool#
Whether the scorer is continuous or categorical. categories() should be implemented if True
- static name() str#
Get the name of this Scorer.
- Returns:
the Scorer name
- static requires_reference() bool#
True if scorer requires reference output to compute score, False otherwise
- run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#
Score a batch of candidate generations.
- Parameters:
candidate_batch – candidate generations to score
reference_batch – reference strings representing target outputs
input_text_batch – optional corresponding inputs
context_batch – optional corresponding contexts, if needed by scorer
- Returns:
scoring results for this batch. Float scores are deprecated, use ScoreResult instead
- to_dict(warn=False)#
Provides a json serializable representation of the scorer.
- class arthur_bench.scoring.hedging_language.HedgingLanguage(model_type: str = 'microsoft/deberta-v3-base', hedging_language: str = "As an AI language model, I don't have personal opinions, emotions, or beliefs.")#
Bases:
Scorer
Given an input question and model output, determine if the output contains hedging language such as “As an AI language model, I don’t have personal opinions, emotions, or beliefs”. The values returned are a similarity score (BERTScore), with higher values corresponding to a higher likelihood of hedging language being present in the model output.
- static name() str#
Get the name of this Scorer.
- Returns:
the Scorer name
- static requires_reference() bool#
True if scorer requires reference output to compute score, False otherwise
- run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#
Score a batch of candidate generations.
- Parameters:
candidate_batch – candidate generations to score
reference_batch – reference strings representing target outputs
input_text_batch – optional corresponding inputs
context_batch – optional corresponding contexts, if needed by scorer
- Returns:
scoring results for this batch. Float scores are deprecated, use ScoreResult instead
- to_dict(warn=False)#
Provides a json serializable representation of the scorer.
- class arthur_bench.scoring.python_unit_testing.PythonUnitTesting(unit_test_dir: str | None = None, unit_tests: List[str] | None = None)#
Bases:
Scorer
Wraps the HuggingFace code_eval metric.
Scores each candidate_output as a function against a pre-prepared unit test
Note: considers any code with non-standard python libraries (e.g. numpy) to have an error
https://huggingface.co/spaces/evaluate-metric/code_eval
- static categories() List[Category]#
All possible values returned by the scorer if output type is categorical.
- static is_categorical() bool#
Whether the scorer is continuous or categorical. categories() should be implemented if True
- static name() str#
Get the name of this Scorer.
- Returns:
the Scorer name
- static requires_reference() bool#
True if scorer requires reference output to compute score, False otherwise
- run(candidate_outputs: List[str], reference_outputs: List[str] | None = None, inputs: List[str] | None = None, contexts: List[str] | None = None, batch_size: int = 1) List[ScoreResult]#
Score a set of test cases. This method doesn’t need to be implemented in most cases, but can be overridden to add additional functionality such as task-specific logging.
- Parameters:
candidate_outputs – candidate generations to score
reference_outputs – reference strings representing target outputs
inputs – input strings being tested
contexts – optional corresponding contexts, if needed by scorer
batch_size – size of batches
- Returns:
scoring results for this run. Float scores are deprecated, use ScoreResult instead
- run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#
Score a batch of candidate generations.
- Parameters:
candidate_batch – candidate generations to score
reference_batch – reference strings representing target outputs
input_text_batch – optional corresponding inputs
context_batch – optional corresponding contexts, if needed by scorer
- Returns:
scoring results for this batch. Float scores are deprecated, use ScoreResult instead
- to_dict(warn=False)#
Provides a json serializable representation of the scorer.
- class arthur_bench.scoring.qa_quality.QAQualityCorrectness(llm: BaseChatModel | None = None)#
Bases:
Scorer
Given an input question, context string, and model generation, determine if the generation produced a correct answer.
- async arun_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[float] | List[ScoreResult]#
Reference batch is not used for this scoring method; QA correctness requires an input_text_batch and a context_batch.
- static categories() List[Category]#
All possible values returned by the scorer if output type is categorical.
- static is_categorical() bool#
Whether the scorer is continuous or categorical. categories() should be implemented if True
- static name() str#
Get the name of this Scorer.
- Returns:
the Scorer name
- static requires_reference() bool#
True if scorer requires reference output to compute score, False otherwise
- run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#
Reference batch is not used for this scoring method; QA correctness requires an input_text_batch and a context_batch.
- to_dict(warn=False)#
Provides a json serializable representation of the scorer.
- static validate_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) Tuple[List[str], List[str]]#
- class arthur_bench.scoring.readability.Readability#
Bases:
Scorer
Flesch Reading Ease Score: the higher the score, the easier the text is to read. Scores of 90–100 correspond to a 5th-grade reading level, while scores <10 are classified as “Extremely difficult to read, and best understood by university graduates.”
https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
- static name() str#
Get the name of this Scorer.
- Returns:
the Scorer name
- static requires_reference() bool#
True if scorer requires reference output to compute score, False otherwise
- run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#
Score a batch of candidate generations.
- Parameters:
candidate_batch – candidate generations to score
reference_batch – reference strings representing target outputs
input_text_batch – optional corresponding inputs
context_batch – optional corresponding contexts, if needed by scorer
- Returns:
scoring results for this batch. Float scores are deprecated, use ScoreResult instead
- class arthur_bench.scoring.scorer.Scorer#
Bases:
ABC
Base class for all scorers. Computes a float score for a given model generation.
- async arun(candidate_outputs: List[str], reference_outputs: List[str] | None = None, inputs: List[str] | None = None, contexts: List[str] | None = None, batch_size: int = 5) List[float] | List[ScoreResult]#
Async version of run method.
- async arun_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[float] | List[ScoreResult]#
Async version of run_batch method.
- static categories() List[Category] | None#
All possible values returned by the scorer if output type is categorical.
- classmethod from_dict(config: dict)#
Load a scorer from a json configuration file.
- static is_categorical() bool#
Whether the scorer is continuous or categorical. categories() should be implemented if True
- abstract static name() str#
Get the name of this Scorer.
- Returns:
the Scorer name
- static requires_reference() bool#
True if scorer requires reference output to compute score, False otherwise
- run(candidate_outputs: List[str], reference_outputs: List[str] | None = None, inputs: List[str] | None = None, contexts: List[str] | None = None, batch_size: int = 1) List[float] | List[ScoreResult]#
Score a set of test cases. This method doesn’t need to be implemented in most cases, but can be overridden to add additional functionality such as task-specific logging.
- Parameters:
candidate_outputs – candidate generations to score
reference_outputs – reference strings representing target outputs
inputs – input strings being tested
contexts – optional corresponding contexts, if needed by scorer
batch_size – size of batches
- Returns:
scoring results for this run. Float scores are deprecated, use ScoreResult instead
- abstract run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[float] | List[ScoreResult]#
Score a batch of candidate generations.
- Parameters:
candidate_batch – candidate generations to score
reference_batch – reference strings representing target outputs
input_text_batch – optional corresponding inputs
context_batch – optional corresponding contexts, if needed by scorer
- Returns:
scoring results for this batch. Float scores are deprecated, use ScoreResult instead
- to_dict(warn=False)#
Provides a json serializable representation of the scorer.
- to_metadata() ScoringMethod#
- classmethod type() ScoringMethodType#
Supplies whether a scorer is built-in or custom.
This method is implemented by checking whether the Scorer class is part of the arthur_bench.scoring module.
- Returns:
the type (built-in or custom)
- class arthur_bench.scoring.specificity.Specificity#
Bases:
Scorer
Returns a score from 0.0 to 1.0 indicating how specific the candidate output language is. Higher scores indicate that the language is more specific; lower scores indicate more vague language.
Specificity is computed by detecting predefined words that indicate vagueness, determining how rare the words used are according to word frequencies calculated from popular NLP corpora, and detecting the use of proper nouns and numbers.
- get_mean_word_freq(candidate_output: str) float#
Returns the mean word frequency of the candidate output. Higher values indicate that more common words are used, on average, in the candidate output. Considers only words with frequency <0.001, truncating the probability of words with higher frequencies to 0.001.
- get_num_vague_words(candidate_output: str) int#
Returns the number of words in candidate_output which are in a list of pre-defined vague words.
- get_pn_and_num(candidate_output: str) int#
Returns total number of Proper Nouns and Numbers in candidate output. Determined heuristically via NNP and CD nltk tags.
- static name() str#
Get the name of this Scorer.
- Returns:
the Scorer name
- static requires_reference() bool#
True if scorer requires reference output to compute score, False otherwise
- run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#
Score a batch of candidate generations.
- Parameters:
candidate_batch – candidate generations to score
reference_batch – reference strings representing target outputs
input_text_batch – optional corresponding inputs
context_batch – optional corresponding contexts, if needed by scorer
- Returns:
scoring results for this batch. Float scores are deprecated, use ScoreResult instead
- class arthur_bench.scoring.summary_quality.SummaryQuality(llm: BaseChatModel | None = None, context_window: int = 4096, tokenizer: Encoding | None = None)#
Bases:
Scorer
Comprehensive measure of summarization quality compared to a reference summary.
- async arun(candidate_outputs: List[str], reference_outputs: List[str] | None = None, inputs: List[str] | None = None, contexts: List[str] | None = None, batch_size: int = 5) List[float] | List[ScoreResult]#
Async version of run method.
- async arun_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[float] | List[ScoreResult]#
Summary quality requires input_text_batch. Asynchronous implementation
- static categories() List[Category]#
All possible values returned by the scorer if output type is categorical.
- static is_categorical() bool#
Whether the scorer is continuous or categorical. categories() should be implemented if True
- static name() str#
Get the name of this Scorer.
- Returns:
the Scorer name
- run(candidate_outputs: List[str], reference_outputs: List[str] | None = None, inputs: List[str] | None = None, contexts: List[str] | None = None, batch_size: int = 1) List[ScoreResult] | List[float]#
Score a set of test cases. This method doesn’t need to be implemented in most cases, but can be overridden to add additional functionality such as task-specific logging.
- Parameters:
candidate_outputs – candidate generations to score
reference_outputs – reference strings representing target outputs
inputs – input strings being tested
contexts – optional corresponding contexts, if needed by scorer
batch_size – size of batches
- Returns:
scoring results for this run. Float scores are deprecated, use ScoreResult instead
- run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#
Summary quality requires input_text_batch.
- to_dict(warn=False)#
Provides a json serializable representation of the scorer.
- static validate_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) Tuple[List[str], List[str]]#
- arthur_bench.scoring.summary_quality.truncate_input_text(input_text, ref_output, cand_output, context_window: int = 4096, tokenizer: ~tiktoken.core.Encoding = <Encoding 'cl100k_base'>) Tuple[str, bool]#
Truncates the input_text to fit in LLM evaluator context
Truncate the input text so that the filled-in COMPARE prompt which contains {input text + summary A + summary B} fits in the evaluator context window
Returns the tuple (text, whether text was truncated)
- class arthur_bench.scoring.utils.suppress_warnings(logger_name: str)#
Bases:
object
A context-manager class that temporarily sets the logging level for a logger to ERROR before returning it to its previous state.
- class arthur_bench.scoring.word_count_match.WordCountMatch#
Bases:
Scorer
Calculates how similar the number of words in the candidate output is to the number of words in the reference output. Scores span from 0 to 1. A score of 1.0 indicates that there are the same number of words in the candidate output as in the reference output. Scores less than 1.0 are calculated as ((len_reference-delta)/len_reference) where delta is the absolute difference in word counts between the candidate and reference outputs. All negative computed values are truncated to 0. Utilizes lexicon count, removing punctuation: https://pypi.org/project/textstat/
- static name() str#
Get the name of this Scorer.
- Returns:
the Scorer name
- run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#
Score a batch of candidate generations.
- Parameters:
candidate_batch – candidate generations to score
reference_batch – reference strings representing target outputs
input_text_batch – optional corresponding inputs
context_batch – optional corresponding contexts, if needed by scorer
- Returns:
scoring results for this batch. Float scores are deprecated, use ScoreResult instead