Source code for wsipipe.datasets.stripai

"""
This module creates the dataframe for the STRIP AI dataset with the following columns:
    - The slide column stores the paths on disk of the whole slide images
    - The annotation column records a string with the slide label 
    - The label column is the slide level label
    - The tags column contains the center and patient for each slide

This assumes there is a folder on disk with the same structure as the dataset
downloaded from the Kaggle website:
https://www.kaggle.com/competitions/mayo-clinic-strip-ai/data

"""

import os
import subprocess
from pathlib import Path

import pandas as pd

def convert_to_pyramids(
    data_root: Path = Path("data", "mayo-clinic-strip-ai"),
    out_root: Path = Path("experiments", "mayo_pyramids"),
    project_root: Path = None,
):
    """Create pyramids for whole slide images.

    The whole slide images as downloaded only contain data at level 0, no
    other levels are present. This can make it slow to access the slides.
    This function runs over all the ``*.tif`` slides in the train and test
    folders of the dataset and writes out copies that contain a pyramid of
    levels, using the ``vips`` command line tool.

    Files are written to ``out_root / "train"`` and ``out_root / "test"``.
    Slides whose output file already exists are skipped, so the function can
    be re-run to resume an interrupted conversion.

    Args:
        data_root (Path, optional): location of the strip ai data, relative
            to the project root when one is given.
            Defaults to data/mayo-clinic-strip-ai.
        out_root (Path, optional): folder the pyramid copies are written to,
            relative to the project root when one is given.
            Defaults to experiments/mayo_pyramids.
        project_root (Path, optional): if given, it is prepended to both
            data_root and out_root. Defaults to None.
    """

    def convert(in_path, out_path):
        print(f"Converting {in_path}")
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact and avoids shell injection.
        command = [
            "vips",
            "tiffsave",
            str(in_path),
            str(out_path),
            "--compression=lzw",
            "--tile",
            "--tile-width=256",
            "--tile-height=256",
            "--pyramid",
        ]
        try:
            completed = subprocess.run(command, check=False)
        except OSError as err:
            # e.g. the vips executable is not installed; report and carry on
            # rather than raising, as the conversion has always been
            # best-effort.
            print(f"Could not run vips for {in_path}: {err}")
            return
        if completed.returncode != 0:
            print(f"vips failed for {in_path} (exit code {completed.returncode})")
            # The output did not exist before this call, so anything present
            # now is a partial file; remove it so that a re-run retries the
            # slide instead of skipping it.
            if out_path.exists():
                out_path.unlink()

    data_root = Path(data_root)
    out_root = Path(out_root)
    if project_root is not None:
        data_root = Path(project_root) / data_root
        out_root = Path(project_root) / out_root

    # The download nests each split in a folder of the same name.
    for split in ("train", "test"):
        source_dir = data_root / split / split
        target_dir = out_root / split
        for img_path in sorted(source_dir.glob("*.tif")):
            output_path = target_dir / img_path.name
            if output_path.exists():
                continue
            # Assumption: vips does not create missing parent folders, so
            # make sure the output folder exists before converting.
            target_dir.mkdir(parents=True, exist_ok=True)
            convert(img_path, output_path)
def training(
    data_root: Path = Path("data", "mayo-clinic-strip-ai"),
    project_root: Path = None,
) -> pd.DataFrame:
    """Create Strip AI training dataset.

    This function goes through the input directory for the training slides
    (``data_root / "train" / "train"``) and matches up each ``*.tif`` slide
    path with the information in ``data_root / "train.csv"``, using the file
    stem as the ``image_id``.

    It creates a dataframe with one row per slide, in sorted path order. The
    slide label from the csv is stored for both label and annotation. The
    tags column stores the center id and patient id.

    Args:
        data_root (Path, optional): location of the strip ai data, relative
            to the project root when one is given.
            Defaults to data/mayo-clinic-strip-ai.
        project_root (Path, optional): if given, it is prepended to
            data_root. Defaults to None.

    Returns:
        df (pd.DataFrame): A dataframe with columns slide, annotation, label
        and tags

    Raises:
        IndexError: if a slide on disk has no matching image_id in train.csv.
    """
    # set up the paths to the slides and labels
    data_root = Path(data_root)
    if project_root is not None:
        data_root = Path(project_root) / data_root
    dataset_root = data_root / "train" / "train"
    labels_df = pd.read_csv(data_root / "train.csv")

    slidepaths = []
    annots = []
    labels = []
    tags = []
    # Every slide is included; sorting makes the row order reproducible
    # because glob returns files in arbitrary order.
    for slide_path in sorted(dataset_root.glob("*.tif")):
        image_id = slide_path.stem
        row = labels_df[labels_df.image_id == image_id]
        if row.empty:
            raise IndexError(
                f"No entry with image_id {image_id!r} in train.csv for slide {slide_path}"
            )
        label = row.label.iloc[0]
        tag = "center" + str(row.center_id.iloc[0]) + "; patient " + str(row.patient_id.iloc[0])
        slidepaths.append(slide_path)
        # there are no separate annotation files, the slide label is reused
        annots.append(label)
        labels.append(label)
        tags.append(tag)

    return pd.DataFrame(
        {
            "slide": slidepaths,
            "annotation": annots,
            "label": labels,
            "tags": tags,
        }
    )
# convert_to_pyramids()