arthur_bench.models#
Submodules#
- class arthur_bench.models.models.CategoricalHistogramItem(*, count: int, category: Category)#
Bases: BaseModel
- count: int#
- class arthur_bench.models.models.Category(*, name: str, description: str | None = None)#
Bases: BaseModel
- description: str | None#
- name: str#
- class arthur_bench.models.models.CommonSortEnum(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)#
Bases: str, Enum
- CREATED_AT_ASC = 'created_at'#
- CREATED_AT_DESC = '-created_at'#
- NAME_ASC = 'name'#
- NAME_DESC = '-name'#
- class arthur_bench.models.models.CreateRunRequest(*, name: str, test_case_outputs: List[TestCaseOutput], description: str | None = None, model_name: str | None = None, foundation_model: str | None = None, prompt_template: str | None = None, model_version: str | None = None)#
Bases: BaseModel
- classmethod consistent_categories(v)#
- description: str | None#
Optional description of the run
- foundation_model: str | None#
Optional foundation model name identifying the pretrained model used to generate outputs
- model_name: str | None#
Optional model name identifying the model used to generate outputs
- model_version: str | None#
Optional model version identifying the version of the model used to generate outputs
- name: str#
Name identifier of the run
- prompt_template: str | None#
Optional prompt template name identifying the global prompt used to generate outputs
- test_cases: List[TestCaseOutput]#
List of outputs and scores for all cases in the test suite
- class arthur_bench.models.models.HistogramItem(*, count: int, low: float, high: float)#
Bases: BaseModel
Boundaries and count for a single bucket of a run histogram
- count: int#
- high: float#
- low: float#
- class arthur_bench.models.models.PaginatedRun(*, id: UUID, name: str, test_suite_id: UUID, test_case_runs: List[RunResult], updated_at: datetime, created_at: datetime, page: int | None = None, page_size: int | None = None, total_pages: int | None = None, total_count: int | None = None)#
Bases: BaseModel
Paginated list of prompts, reference outputs, model outputs, and scores for a particular run.
- created_at: datetime#
- id: UUID#
- name: str#
- page: int | None#
- page_size: int | None#
- test_suite_id: UUID#
- total_count: int | None#
- total_pages: int | None#
- updated_at: datetime#
- class arthur_bench.models.models.PaginatedRuns(*, test_runs: List[TestRunMetadata], page: int, page_size: int, total_pages: int, total_count: int)#
Bases: BaseModel
Paginated list of runs for a test suite.
- page: int#
- page_size: int#
- test_runs: List[TestRunMetadata]#
- total_count: int#
- total_pages: int#
- class arthur_bench.models.models.PaginatedTestSuite(*, id: UUID, name: str, scoring_method: ScoringMethod, test_cases: List[TestCaseResponse], created_at: datetime, updated_at: datetime, description: str | None = None, last_run_time: datetime | None = None, num_runs: int = 0, page: int | None = None, page_size: int | None = None, total_pages: int | None = None, total_count: int | None = None)#
Bases: BaseModel
Test suite and optional page information
- created_at: datetime#
- description: str | None#
- id: UUID#
- last_run_time: datetime | None#
- name: str#
- num_runs: int#
- page: int | None#
- page_size: int | None#
- scoring_method: ScoringMethod#
- test_cases: List[TestCaseResponse]#
- total_count: int | None#
- total_pages: int | None#
- updated_at: datetime#
- class arthur_bench.models.models.PaginatedTestSuites(*, test_suites: List[TestSuiteMetadata], page: int, page_size: int, total_pages: int, total_count: int)#
Bases: BaseModel
Paginated list of test suites.
- page: int#
- page_size: int#
- test_suites: List[TestSuiteMetadata]#
- total_count: int#
- total_pages: int#
- class arthur_bench.models.models.RunResult(*, id: UUID, output: str, score: float, input: str | None = None, reference_output: str | None = None, score_result: ScoreResult)#
Bases: BaseModel
- id: UUID#
- input: str | None#
- output: str#
- reference_output: str | None#
- score: float#
- score_result: ScoreResult#
- classmethod score_result_backwards_compatible(values)#
- class arthur_bench.models.models.ScoreResult(*, score: float | None = None, category: Category | None = None)#
Bases: BaseModel
- classmethod contains_score(values)#
- score: float | None#
- class arthur_bench.models.models.ScorerOutputType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)#
Bases: str, Enum
Indicates the output type of the scorer
- Categorical = 'categorical'#
- Continuous = 'continuous'#
- class arthur_bench.models.models.ScoringMethod(*, name: str, type: ScoringMethodType, config: dict = {}, output_type: ScorerOutputType = ScorerOutputType.Continuous, categories: List[Category] | None = None)#
Bases: BaseModel
Scoring method configuration
- categories: List[Category] | None#
Valid categories returned by the scorer. Only valid if output_type is categorical.
- config: dict#
Configuration as used by the scorer to_dict and from_dict methods
- name: str#
Name of the scorer
- output_type: ScorerOutputType#
Whether the scoring method returns categorical scores
- classmethod scoring_method_categorical_defined(values)#
- type: ScoringMethodType#
Whether the scoring method was a bench default or a custom implementation
- class arthur_bench.models.models.ScoringMethodType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)#
Bases: str, Enum
Indicates whether the scoring method was provided by the package or a custom implementation
- BuiltIn = 'built_in'#
- Custom = 'custom'#
- class arthur_bench.models.models.SummaryItem(*, id: UUID, name: str, avg_score: float, histogram: List[HistogramItem | CategoricalHistogramItem])#
Bases: BaseModel
Aggregate statistics for a single run: average score and score distribution
- avg_score: float#
- classmethod either_continuous_or_categorical(v)#
Validate that the items in the histogram list either all contain low/high floats or all contain a category
- histogram: List[HistogramItem | CategoricalHistogramItem]#
- id: UUID#
- name: str#
- class arthur_bench.models.models.TestCaseOutput(*, id: UUID, output: str, score: float | None = None, score_result: ScoreResult)#
Bases: BaseModel
A generated output, score pair
- id: UUID#
Optional unique identifier for this test case of the suite and run
- output: str#
Generated output for test case
- score: float | None#
Score assigned to output. This field is deprecated; use score_result instead
- score_result: ScoreResult#
Score information about output. Contains float score and / or category description
- classmethod score_result_backwards_compatible(values)#
- class arthur_bench.models.models.TestCaseRequest(*, input: str, reference_output: str | None = None)#
Bases: BaseModel
An input, reference output pair.
- input: str#
Input to the test case. Does not include the prompt template.
- reference_output: str | None#
Reference or “Golden” output for the given input.
- class arthur_bench.models.models.TestCaseResponse(*, id: UUID, input: str, reference_output: str | None = None)#
Bases: BaseModel
- id: UUID#
- input: str#
Input to the test case. Does not include the prompt template.
- reference_output: str | None#
Reference or “Golden” output for the given input.
- class arthur_bench.models.models.TestCaseSortEnum(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)#
Bases: str, Enum
- SCORE_ASC = 'score'#
- SCORE_DESC = '-score'#
- class arthur_bench.models.models.TestRunMetadata(*, id: UUID, name: str, created_at: datetime, updated_at: datetime, avg_score: float | None = None, model_version: str | None = None, prompt_template: str | None = None)#
Bases: BaseModel
- avg_score: float | None#
- created_at: datetime#
- id: UUID#
- model_version: str | None#
- name: str#
- prompt_template: str | None#
- updated_at: datetime#
- class arthur_bench.models.models.TestRunSortEnum(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)#
Bases: str, Enum
- AVG_SCORE_ASC = 'avg_score'#
- AVG_SCORE_DESC = '-avg_score'#
- class arthur_bench.models.models.TestSuiteMetadata(*, id: UUID, name: str, scoring_method: ScoringMethod, last_run_time: datetime | None = None, description: str | None = None, created_at: datetime | None = None, updated_at: datetime | None = None)#
Bases: BaseModel
- created_at: datetime | None#
- description: str | None#
- id: UUID#
- last_run_time: datetime | None#
- name: str#
- scoring_method: ScoringMethod#
- updated_at: datetime | None#
- class arthur_bench.models.models.TestSuiteRequest(*, name: str, description: str | None = None, scoring_method: ScoringMethod, test_cases: ConstrainedListValue[TestCaseRequest])#
Bases: BaseModel
Test case data and metadata for the test suite.
- description: str | None#
Optional description of the test suite
- name: str#
Name of the test suite
- classmethod null_reference_outputs_all_or_none(v)#
Validate that all or none of test case reference outputs are null
- scoring_method: ScoringMethod#
Scoring configuration to use as criteria for the test suite
- classmethod scoring_method_backwards_compatible(v)#
- test_cases: List[TestCaseRequest]#
List of input texts and optional reference outputs to consistently score model generations against
- class arthur_bench.models.models.TestSuiteSortEnum(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)#
Bases: str, Enum
- LAST_RUNTIME_ASC = 'last_run_time'#
- LAST_RUNTIME_DESC = '-last_run_time'#
- class arthur_bench.models.models.TestSuiteSummary(*, summary: List[SummaryItem], page: int, page_size: int, total_pages: int, total_count: int, num_test_cases: int, categorical: bool = False)#
Bases: BaseModel
Aggregate descriptions of runs of a test suite. Provides averages and score distributions
- categorical: bool#
- num_test_cases: int#
- page: int#
- page_size: int#
- summary: List[SummaryItem]#
- total_count: int#
- total_pages: int#