diff --git a/requirements.txt b/requirements.txt index 7864c9faa4950728fb103595c4deb7bc2dc05da6..a06fd49cc67632eaf0fe3e4a0dd3608450979a44 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,10 @@ -fastapi==0.100.1 +fastapi==0.103.1 hypercorn==0.14.4 toml==0.10.2 email-validator==2.0.0.post2 python-dotenv==1.0.0 passlib[bcrypt]==1.7.4 -httpx[http2]==0.24.1 -pymitter==0.4.0 +pymitter==0.4.2 uvicorn==0.23.2 python-multipart==0.0.6 -nacsos_data[scripts,server] @ git+ssh://git@gitlab.pik-potsdam.de/mcc-apsis/nacsos/nacsos-data.git@v0.9.5 +nacsos_data[scripts,server,utils] @ git+ssh://git@gitlab.pik-potsdam.de/mcc-apsis/nacsos/nacsos-data.git@v0.10.0 diff --git a/requirements_dev.txt b/requirements_dev.txt index 86287e88a638bec4089ea6ec9f421fd4b0f91b26..ef23b06b94319b915abc64bca23627da0c5d4070 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,7 +1,7 @@ flake8==6.1.0 tox==4.6.4 -pytest==7.4.0 +pytest==7.4.2 pytest-cov==4.1.0 -mypy==1.4.1 +mypy==1.5.1 types-toml==0.10.8.7 types-PyYAML==6.0.12.11 \ No newline at end of file diff --git a/server/__init__.py b/server/__init__.py index 0c74e944bcc9db5458bef6253d62e1b9f8cfe6c3..80c67b0d01689c478908eba7f2bdd33bdcb9a55a 100644 --- a/server/__init__.py +++ b/server/__init__.py @@ -18,7 +18,8 @@ logger = get_logger('nacsos.server') app = FastAPI(openapi_url=settings.SERVER.OPENAPI_FILE, openapi_prefix=settings.SERVER.OPENAPI_PREFIX, - root_path=settings.SERVER.ROOT_PATH) + root_path=settings.SERVER.ROOT_PATH, + separate_input_output_schemas=False) logger.debug('Setting up server and middlewares') mimetypes.add_type('application/javascript', '.js') diff --git a/server/api/__init__.py b/server/api/__init__.py index b83e88c87cfd408775b139e56cd1c98554e98204..cab5d2d2b35579c663a8bab37d66a993081074b5 100644 --- a/server/api/__init__.py +++ b/server/api/__init__.py @@ -11,6 +11,7 @@ from .routes import highlight from .routes import stats from .routes import export from .routes import search +from .routes import evaluation # this router proxies all /api endpoints router = APIRouter() @@ -50,3 +51,6 @@ router.include_router(export.router, prefix='/export', tags=['export']) # route for searching data (e.g. in openalex) router.include_router(search.router, prefix='/search', tags=['search']) + +# route for computing evaluation metrics and other statistics +router.include_router(evaluation.router, prefix='/eval', tags=['evaluation']) diff --git a/server/api/routes/annotations.py b/server/api/routes/annotations.py index dea6327b992aca2d47d2eb3022ab3481bc285b5e..22e3b98bb984dadea698229f8585a05cfce677a4 100644 --- a/server/api/routes/annotations.py +++ b/server/api/routes/annotations.py @@ -5,76 +5,83 @@ from sqlalchemy import select from sqlalchemy.orm import load_only from fastapi import APIRouter, Depends, HTTPException, status as http_status, Query -from nacsos_data.db.schemas import \ - BotAnnotationMetaData, \ - AssignmentScope, \ - User, \ +from nacsos_data.db.schemas import ( + BotAnnotationMetaData, + AssignmentScope, + User, Annotation -from nacsos_data.models.annotations import \ - AnnotationSchemeModel, \ - AssignmentScopeModel, \ - AssignmentModel, \ - AssignmentStatus, \ - AssignmentScopeConfig, \ - AnnotationSchemeModelFlat, \ - FlattenedAnnotationSchemeLabel -from nacsos_data.models.bot_annotations import \ - ResolutionMethod, \ - AnnotationFilters, \ - BotAnnotationModel, \ - AnnotationCollection, \ - BotMetaResolve, \ - GroupedBotAnnotation, \ - AnnotationCollectionDB, \ - BotKind, \ - BotAnnotationMetaDataBaseModel +) +from nacsos_data.models.annotations import ( + AnnotationSchemeModel, + AssignmentScopeModel, + AssignmentModel, + AssignmentStatus, + AssignmentScopeConfig, + AnnotationSchemeModelFlat +) +from nacsos_data.models.bot_annotations import ( + BotKind, + BotAnnotationMetaDataBaseModel, + BotAnnotationResolution, + ResolutionMatrix, + BotMetaResolveBase, + ResolutionProposal +) from nacsos_data.models.users import UserModel from nacsos_data.models.items import AnyItemModel from nacsos_data.db.crud.items import read_any_item_by_item_id from nacsos_data.db.crud.projects import read_project_by_id -from nacsos_data.db.crud.annotations import \ - read_assignment, \ - read_assignments_for_scope, \ - read_assignments_for_scope_for_user, \ - read_assignment_scopes_for_project, \ - read_assignment_scopes_for_project_for_user, \ - read_annotations_for_assignment, \ - read_next_assignment_for_scope_for_user, \ - read_next_open_assignment_for_scope_for_user, \ - read_annotation_scheme, \ - read_annotation_schemes_for_project, \ - upsert_annotations, \ - read_assignment_scope, \ - upsert_annotation_scheme, \ - delete_annotation_scheme, \ - upsert_assignment_scope, \ - delete_assignment_scope, \ - read_item_ids_with_assignment_count_for_project, \ - read_assignment_counts_for_scope, \ - ItemWithCount, \ - AssignmentCounts, \ - UserProjectAssignmentScope, \ - store_assignments, \ - store_resolved_bot_annotations, \ - update_resolved_bot_annotations, read_assignment_overview_for_scope, AssignmentScopeEntry -from nacsos_data.util.annotations.resolve import \ - AnnotationFilterObject, \ - get_resolved_item_annotations, \ - read_bot_annotations, ResolutionProposal -from nacsos_data.util.annotations.validation import \ - merge_scheme_and_annotations, \ - annotated_scheme_to_annotations, \ +from nacsos_data.db.crud.annotations import ( + read_assignment, + read_assignments_for_scope, + read_assignments_for_scope_for_user, + read_assignment_scopes_for_project, + read_assignment_scopes_for_project_for_user, + read_annotations_for_assignment, + read_next_assignment_for_scope_for_user, + read_next_open_assignment_for_scope_for_user, + read_annotation_schemes_for_project, + upsert_annotations, + read_assignment_scope, + upsert_annotation_scheme, + delete_annotation_scheme, + upsert_assignment_scope, + delete_assignment_scope, + read_item_ids_with_assignment_count_for_project, + read_assignment_counts_for_scope, + ItemWithCount, + AssignmentCounts, + UserProjectAssignmentScope, + store_assignments, + store_resolved_bot_annotations, + update_resolved_bot_annotations, + read_assignment_overview_for_scope, + AssignmentScopeEntry, + read_resolved_bot_annotations, + read_resolved_bot_annotation_meta, + read_resolved_bot_annotations_for_meta +) +from nacsos_data.util.annotations.resolve import ( + AnnotationFilterObject, + get_resolved_item_annotations, + read_annotation_scheme +) +from nacsos_data.util.annotations.validation import ( + merge_scheme_and_annotations, + annotated_scheme_to_annotations, flatten_annotation_scheme +) from nacsos_data.util.annotations.assignments.random import random_assignments from nacsos_data.util.annotations.assignments.random_exclusion import random_assignments_with_exclusion -from server.api.errors import \ - SaveFailedError, \ - AssignmentScopeNotFoundError, \ - NoNextAssignmentWarning, \ - ProjectNotFoundError, \ - AnnotationSchemeNotFoundError, \ +from server.api.errors import ( + SaveFailedError, + AssignmentScopeNotFoundError, + NoNextAssignmentWarning, + ProjectNotFoundError, + AnnotationSchemeNotFoundError, MissingInformationError +) from server.util.security import UserPermissionChecker from server.data import db_engine @@ -108,7 +115,8 @@ async def get_scheme_definition(annotation_scheme_id: str, :param permissions: :return: a single annotation scheme """ - scheme = await read_annotation_scheme(annotation_scheme_id=annotation_scheme_id, db_engine=db_engine) + scheme: AnnotationSchemeModel | None = await read_annotation_scheme(annotation_scheme_id=annotation_scheme_id, + db_engine=db_engine) if scheme is not None: if flat: return flatten_annotation_scheme(scheme) @@ -415,104 +423,93 @@ async def get_annotators_for_scheme(scheme_id: str, .where(Annotation.annotation_scheme_id == scheme_id))).scalars().all()] -class SavedResolutionResponse(BaseModel): - name: str - meta: BotMetaResolve - saved: dict[str, list[GroupedBotAnnotation]] - - @router.get('/config/resolve/', response_model=ResolutionProposal) -async def get_resolved_annotations(strategy: ResolutionMethod, - scheme_id: str, - scope_id: list[str] | None = Query(default=None), - user_id: list[str] | None = Query(default=None), - key: list[str] | None = Query(default=None), - repeat: list[int] | None = Query(default=None), - ignore_order: bool | None = Query(default=False), - ignore_hierarchy: bool | None = Query(default=False), +async def get_resolved_annotations(settings: BotMetaResolveBase, include_empty: bool | None = Query(default=False), existing_resolution: str | None = Query(default=None), include_new: bool | None = Query(default=False), update_existing: bool | None = Query(default=False), - permissions=Depends(UserPermissionChecker('annotations_edit'))): + permissions=Depends(UserPermissionChecker('annotations_edit'))) \ + -> ResolutionProposal: """ Get all annotations that match the filters (e.g. all annotations made by users in scope with :scope_id). - Annotations are returned in a 3D matrix: - rows (dict entries): items (key: item_id) - columns (list index of dict entry): Label (key in scheme + repeat); index map in matrix.keys - cells: list of annotations by each user for item/Label combination :param include_new: :param update_existing: :param existing_resolution: :param include_empty: - :param strategy - :param scheme_id: - :param scope_id: - :param user_id: - :param key: - :param repeat: + :param settings :param permissions: - :param ignore_order: - :param ignore_hierarchy: :return: """ - filters = AnnotationFilters( - scheme_id=scheme_id, - scope_id=scope_id, - user_id=user_id, - key=key, - repeat=repeat, - ) - - if ignore_hierarchy is None: - ignore_hierarchy = False - if ignore_order is None: - ignore_order = False if include_empty is None: include_empty = True if include_new is None: include_new = False if update_existing is None: update_existing = False - return await get_resolved_item_annotations(strategy=strategy, - filters=AnnotationFilterObject.model_validate(filters.model_dump()), - ignore_order=ignore_order, - ignore_hierarchy=ignore_hierarchy, + + if existing_resolution is not None: + return await read_resolved_bot_annotations(db_engine=db_engine, + existing_resolution=existing_resolution, + include_new=include_new, + include_empty=include_empty, + update_existing=update_existing) + filters = AnnotationFilterObject.model_validate(settings.filters) + return await get_resolved_item_annotations(strategy=settings.algorithm, + filters=filters, + ignore_repeat=settings.ignore_repeat, + ignore_hierarchy=settings.ignore_hierarchy, include_new=include_new, include_empty=include_empty, update_existing=update_existing, - existing_resolution=existing_resolution, db_engine=db_engine) -class ResolutionPayload(BaseModel): - name: str - strategy: ResolutionMethod - filters: AnnotationFilters - ignore_order: bool # Refers to `annotation.repeat`, not `assignment.order`! - ignore_hierarchy: bool - collection: AnnotationCollectionDB - bot_annotations: list[BotAnnotationModel] +class SavedResolution(BaseModel): + meta: BotAnnotationResolution + proposal: ResolutionProposal + + +@router.get('/config/resolved/{bot_annotation_meta_id}', response_model=SavedResolution) +async def get_saved_resolved_annotations(bot_annotation_metadata_id: str, + permissions=Depends(UserPermissionChecker('annotations_edit'))) \ + -> SavedResolution: + async with db_engine.session() as session: # type: AsyncSession + bot_meta = await read_resolved_bot_annotation_meta(bot_annotation_metadata_id=bot_annotation_metadata_id, + session=session) + proposal = await read_resolved_bot_annotations_for_meta(session=session, + bot_meta=bot_meta, + include_new=False, + include_empty=False, + update_existing=False) + return SavedResolution(meta=bot_meta, proposal=proposal) @router.put('/config/resolve/', response_model=str) -async def save_resolved_annotations(data: ResolutionPayload, +async def save_resolved_annotations(settings: BotMetaResolveBase, + matrix: ResolutionMatrix, + name: str, permissions=Depends(UserPermissionChecker('annotations_edit'))): - meta_id = await store_resolved_bot_annotations( - project_id=permissions.permissions.project_id, name=data.name, algorithm=data.strategy, - filters=data.filters, ignore_hierarchy=data.ignore_hierarchy, ignore_repeat=data.ignore_order, - collection=data.collection, bot_annotations=data.bot_annotations, db_engine=db_engine) + meta_id = await store_resolved_bot_annotations(db_engine=db_engine, + project_id=permissions.permissions.project_id, + name=name, + algorithm=settings.algorithm, + filters=settings.filters, + ignore_hierarchy=settings.ignore_hierarchy, + ignore_repeat=settings.ignore_repeat, + matrix=matrix) return meta_id @router.put('/config/resolve/update') async def update_resolved_annotations(bot_annotation_metadata_id: str, name: str, - bot_annotations: list[BotAnnotationModel], + matrix: ResolutionMatrix, permissions=Depends(UserPermissionChecker('annotations_edit'))) -> None: + # TODO: allow update of filters and settings? await update_resolved_bot_annotations(bot_annotation_metadata_id=bot_annotation_metadata_id, - name=name, bot_annotations=bot_annotations, db_engine=db_engine) + name=name, matrix=matrix, db_engine=db_engine) @router.get('/config/resolved-list/', response_model=list[BotAnnotationMetaDataBaseModel]) @@ -535,29 +532,15 @@ async def list_saved_resolved_annotations(permissions=Depends(UserPermissionChec return [BotAnnotationMetaDataBaseModel.model_validate(e.__dict__) for e in exports] -@router.get('/config/resolved/{bot_annotation_meta_id}', response_model=SavedResolutionResponse) -async def get_saved_resolved_annotations(bot_annotation_metadata_id: str, - permissions=Depends(UserPermissionChecker('annotations_edit'))): - bot_annotations = await read_bot_annotations(bot_annotation_metadata_id=bot_annotation_metadata_id, - db_engine=db_engine) - async with db_engine.session() as session: # type: AsyncSession - meta: BotAnnotationMetaData = (await session.execute( - select(BotAnnotationMetaData) - .where(BotAnnotationMetaData.bot_annotation_metadata_id == bot_annotation_metadata_id))) \ - .scalars().one() - return SavedResolutionResponse( - name=meta.name, - meta=meta.meta, - saved=bot_annotations - ) - - @router.delete('/config/resolved/{bot_annotation_meta_id}') async def delete_saved_resolved_annotations(bot_annotation_metadata_id: str, permissions=Depends(UserPermissionChecker('annotations_edit'))): async with db_engine.session() as session: # type: AsyncSession - meta: BotAnnotationMetaData = (await session.execute( + meta: BotAnnotationMetaData | None = (await session.execute( select(BotAnnotationMetaData) .where(BotAnnotationMetaData.bot_annotation_metadata_id == bot_annotation_metadata_id))) \ - .scalars().one() - await session.delete(meta) + .scalars().one_or_none() + if meta is not None: + await session.delete(meta) + # TODO: do we need to commit? + # TODO: ensure bot_annotations are deleted via cascade diff --git a/server/api/routes/evaluation.py b/server/api/routes/evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..187f4806561c4c52ed5fe2cc77db0abbbb7cfc0f --- /dev/null +++ b/server/api/routes/evaluation.py @@ -0,0 +1,129 @@ +import uuid +from typing import TYPE_CHECKING + +from fastapi import APIRouter, BackgroundTasks, Depends +from nacsos_data.db.crud import upsert_orm +from nacsos_data.db.schemas import AnnotationTracker +from nacsos_data.models.annotation_tracker import AnnotationTrackerModel +from nacsos_data.util.annotations.evaluation import get_new_label_batches +from nacsos_data.util.annotations.evaluation.buscar import ( + calculate_h0s_for_batches, + compute_recall, + calculate_h0s) +from nacsos_data.util.annotations.evaluation.label_transform import annotations_to_sequence, get_annotations +from nacsos_data.util.auth import UserPermissions +from sqlalchemy import select + +from server.data import db_engine +from server.api.errors import DataNotFoundWarning +from server.util.logging import get_logger +from server.util.security import UserPermissionChecker + +if TYPE_CHECKING: + from sqlalchemy.ext.asyncio import AsyncSession # noqa F401 + +logger = get_logger('nacsos.api.route.eval') +logger.debug('Setup nacsos.api.route.eval router') + +router = APIRouter() + + +async def read_tracker(session: AsyncSession, tracker_id: str | uuid.UUID, + project_id: str | uuid.UUID | None = None) -> AnnotationTracker: + stmt = (select(AnnotationTracker) + .where(AnnotationTracker.annotation_tracking_id == tracker_id)) + rslt = (await session.scalars(stmt)).one_or_none() + if rslt is None: + raise DataNotFoundWarning(f'No Tracker in project {project_id} for id {tracker_id}!') + return rslt + + +@router.get('/tracking/tracker/{tracker_id}', response_model=AnnotationTrackerModel) +async def get_tracker(tracker_id: str, + permissions: UserPermissions = Depends(UserPermissionChecker('annotations_read'))) \ + -> AnnotationTrackerModel: + async with db_engine.session() as session: # type: AsyncSession + return AnnotationTrackerModel.model_validate(read_tracker(tracker_id=tracker_id, session=session, + project_id=permissions.permissions.project_id)) + + +@router.put('/tracking/tracker', response_model=str) +async def save_tracker(tracker: AnnotationTrackerModel, + permissions: UserPermissions = Depends(UserPermissionChecker('annotations_read'))) -> str: + pkey = await upsert_orm(upsert_model=tracker, Schema=AnnotationTracker, + primary_key='annotation_tracking_id', db_engine=db_engine, + skip_update=['labels', 'recall', 'buscar']) + return str(pkey) + + +@router.post('/tracking/refresh', response_model=AnnotationTrackerModel) +async def update_tracker(tracker_id: str, + background_tasks: BackgroundTasks, + batch_size: int | None = None, + reset: bool = False, + permissions: UserPermissions = Depends(UserPermissionChecker('annotations_edit'))) \ + -> AnnotationTrackerModel: + async with db_engine.session() as session: # type: AsyncSession + tracker = await read_tracker(tracker_id=tracker_id, session=session, + project_id=permissions.permissions.project_id) + + batched_annotations = [await get_annotations(session=session, source_ids=[sid]) + for sid in tracker.source_ids] + batched_sequence = [annotations_to_sequence(tracker.inclusion_rule, annotations=annotations, + majority=tracker.majority) + for annotations in batched_annotations] + + diff: list[list[int]] | None = None + if reset: + tracker.buscar = None + tracker.recall = None + elif tracker.labels is not None: + diff = get_new_label_batches(tracker.labels, batched_sequence) + + # Update labels + tracker.labels = batched_sequence + await session.commit() + + # We are not handing over the existing tracker ORM, because the session is not persistent + background_tasks.add_task(bg_populate_tracker, tracker_id, batch_size, diff) + + return AnnotationTrackerModel.model_validate(tracker) + + +async def bg_populate_tracker(tracker_id: str, batch_size: int | None = None, labels: list[list[int]] | None = None): + async with db_engine.session() as session: # type: AsyncSession + tracker = await read_tracker(tracker_id=tracker_id, session=session) + + if labels is None: + labels = tracker.labels + + flat_labels = [lab for batch in labels for lab in batch] + + recall = compute_recall(labels_=flat_labels) + if tracker.recall is None: + tracker.recall = recall + else: + tracker.recall += recall + + await session.commit() + + # Initialise buscar scores + if tracker.buscar is None: + tracker.buscar = [] + + if batch_size is None: + # Use scopes as batches + it = calculate_h0s_for_batches(labels_=tracker.labels, + recall_target=tracker.recall_target, + n_docs=tracker.n_items_total) + else: + # Ignore the batches derived from scopes and use fixed step sizes + it = calculate_h0s(labels_=flat_labels, + batch_size=batch_size, + recall_target=tracker.recall_target, + n_docs=tracker.n_items_total) + + for x, y in it: + tracker.buscar = tracker.buscar + [(x, y)] + # save after each step, so the user can refresh the page and get data as it becomes available + await session.commit() diff --git a/server/api/routes/export.py b/server/api/routes/export.py index 998280eb386e2d661b561c7c4c2dfc77847477b2..06dfe528763d8d6cfb6ac7d1eae7711734c2c8a1 100644 --- a/server/api/routes/export.py +++ b/server/api/routes/export.py @@ -46,7 +46,7 @@ async def get_annotations_csv(labels: list[LabelOptions], assignment_scope_ids: list[str] | None = Query(default=None), user_ids: list[str] | None = Query(default=None), ignore_hierarchy: bool = Query(default=True), - ignore_order: bool = Query(default=True), + ignore_repeat: bool = Query(default=True), item_fields: list[str] | None = Query(default=None), permissions: UserPermissions = Depends(UserPermissionChecker('annotations_read'))): result = await prepare_export_table(bot_annotation_metadata_ids=bot_annotation_metadata_ids, @@ -54,7 +54,7 @@ async def get_annotations_csv(labels: list[LabelOptions], user_ids=user_ids, project_id=permissions.permissions.project_id, labels=labels, - ignore_order=ignore_order, + ignore_repeat=ignore_repeat, ignore_hierarchy=ignore_hierarchy, item_fields=item_fields, db_engine=db_engine)