arthur_bench.run#

Submodules#

class arthur_bench.run.testrun.TestRun(*, name: str, test_case_outputs: List[TestCaseOutput], description: str | None = None, model_name: str | None = None, foundation_model: str | None = None, prompt_template: str | None = None, model_version: str | None = None, test_suite_id: UUID, client: BenchClient, id: UUID | None = None)#

Bases: CreateRunRequest

class Config#

Bases: object

arbitrary_types_allowed = True#
property categories: List[str | None]#
client: BenchClient#
classmethod from_flattened(run_name: str, ids: List[UUID], candidate_output_list: List[str], scores: List[float] | List[ScoreResult], client: BenchClient, test_suite_id: UUID, model_name: str | None = None, model_version: str | None = None, foundation_model: str | None = None, prompt_template: str | None = None)#
id: UUID | None#
property output: List[str]#
save() UUID#

Save a test run.

property scores: List[float | None]#
test_suite_id: UUID#
class arthur_bench.run.testsuite.TestSuite(name: str, scoring_method: str | Scorer, description: str | None = None, reference_data: DataFrame | None = None, reference_data_path: str | None = None, input_column: str = 'input', reference_column: str = 'reference_output', input_text_list: List[str] | None = None, reference_output_list: List[str] | None = None, client: BenchClient | None = None)#

Bases: object

Reusable pipeline for running a test suite built from reference_data and evaluated using scoring_method

Parameters:
  • name – name of the test suite

  • scoring_method – scoring method or scorer instance to use to evaluate the results of a test run, as a string/enum or class instance

  • description – short description of the task tested by this suite

  • reference_data – dataframe of prompts and reference outputs

  • reference_data_path – filepath to csv of prompts and reference outputs, required if not specifying reference_data

  • input_column – the column of reference_data containing prompts, defaults to ‘input’

  • reference_column – the column of reference_data containing reference outputs, defaults to ‘reference_output’

  • input_text_list – list of strings of input texts that can be provided instead of dataframe columns

  • reference_output_list – list of strings of reference outputs that can be provided instead of dataframe columns

async arun(run_name: str, candidate_data: DataFrame | None = None, candidate_data_path: str | None = None, candidate_column: str = 'candidate_output', candidate_output_list: List[str] | None = None, context_column: str | None = None, context_list: List[str] | None = None, save: bool = True, batch_size: int = 5, model_name: str | None = None, model_version: str | None = None, foundation_model: str | None = None, prompt_template: str | None = None) TestRun#
property description: str | None#
property input_texts: List[str]#
property name: str#
property reference_outputs: List[str | None]#
run(run_name: str, candidate_data: DataFrame | None = None, candidate_data_path: str | None = None, candidate_column: str = 'candidate_output', candidate_output_list: List[str] | None = None, context_column: str | None = None, context_list: List[str] | None = None, save: bool = True, batch_size: int = 1, model_name: str | None = None, model_version: str | None = None, foundation_model: str | None = None, prompt_template: str | None = None) TestRun#

Score a test run on candidate outputs.

Parameters:
  • run_name – name for the test run

  • candidate_data – dataframe of candidate responses to test prompts

  • candidate_data_path – filepath to csv containing candidate responses to test prompts

  • candidate_column – the column of candidate data containing candidate responses, defaults to ‘candidate_output’

  • candidate_output_list – list of strings of candidate outputs that can be provided instead of dataframe

  • context_column – the column of reference_data containing supporting context for question-answering tasks

  • context_list – list of strings containing supporting context for question-answering tasks

  • save – whether to save the run results to file

  • batch_size – the batch_size to use when computing scores

  • model_name – model name for model used to generate outputs

  • model_version – model version of model used to generate outputs

  • foundation_model – foundation model name used to generate outputs

  • prompt_template – prompt template name used to generate outputs

Returns:

TestRun object containing scored outputs

save()#

Save a test suite to local file system.

property scoring_method: str#
property test_cases: List[TestCaseResponse]#