diff --git a/README.md b/README.md index ccf6db4..7827c92 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ flowchart TB file_common_ist("Common iST Dataset") comp_data_processor[/"Data processor"/] file_spatial_dataset("Raw iST Dataset") - file_scrnaseq_reference("scRNA-seq Reference") + file_scrnaseq("scRNA-seq Reference") comp_control_method[/"Control Method"/] comp_method[/"Method"/] comp_metric[/"Metric"/] @@ -48,11 +48,11 @@ flowchart TB file_common_scrnaseq("Common SC Dataset") file_common_ist---comp_data_processor comp_data_processor-->file_spatial_dataset - comp_data_processor-->file_scrnaseq_reference + comp_data_processor-->file_scrnaseq file_spatial_dataset---comp_control_method file_spatial_dataset---comp_method - file_scrnaseq_reference---comp_control_method - file_scrnaseq_reference---comp_metric + file_scrnaseq---comp_control_method + file_scrnaseq---comp_metric comp_control_method-->file_prediction comp_method-->file_prediction comp_metric-->file_score @@ -175,8 +175,8 @@ Arguments: |:---|:---|:---| | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | -| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | -| `--output_scrnaseq_reference` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. | + +| `--output_scrnaseq` | `file` | (*Output*) A single-cell dataset, preprocessed for this benchmark. | diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 57f4d68..4ba5075 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -18,7 +18,7 @@ cat > /tmp/params.yaml << 'HERE' input_states: s3://openproblems-data/resources/datasets/**/state.yaml rename_keys: 'input:output_dataset' output_state: '$id/state.yaml' -settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad", "output_solution": "$id/solution.h5ad"}' +settings: '{"output_scrnaseq": "$id/output_scrnaseq.h5ad"}' publish_dir: s3://openproblems-data/resources/task_template/datasets/ HERE diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 9cb372a..b9b99de 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -13,23 +13,22 @@ cd "$REPO_ROOT" set -e -RAW_DATA=resources_test/common -DATASET_DIR=resources_test/task_template +RAW_DATA=resources_test/task_spatial_segmentation +DATASET_DIR=resources_test/task_spatial_segmentation mkdir -p $DATASET_DIR # process dataset viash run src/data_processors/process_dataset/config.vsh.yaml -- \ - --input $RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad \ - --output_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \ - --output_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \ - --output_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad + --input_sp $RAW_DATA/mouse_brain_combined/common_ist.zarr \ + --input_sc $RAW_DATA/mouse_brain_combined/common_scrnaseq.h5ad \ + # --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ + --output_scrnaseq $DATASET_DIR/mouse_brain_combined/output_scrnaseq.h5ad # run one method viash run src/methods/logistic_regression/config.vsh.yaml -- \ - --input_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \ - --input_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad + --input $DATASET_DIR/mouse_brain_combined/common_ist.zarr \ + --output $DATASET_DIR/mouse_brain_combined/prediction.h5ad # run one metric viash run src/metrics/accuracy/config.vsh.yaml -- \ @@ -38,12 +37,10 @@ viash run src/metrics/accuracy/config.vsh.yaml -- \ --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad # write manual state.yaml. this is not actually necessary but you never know it might be useful -cat > $DATASET_DIR/cxg_mouse_pancreas_atlas/state.yaml << HERE -id: cxg_mouse_pancreas_atlas -train: !file train.h5ad -test: !file test.h5ad -solution: !file solution.h5ad -prediction: !file prediction.h5ad +cat > $DATASET_DIR/mouse_brain_combined/state.yaml << HERE +id: mouse_brain_combined +processed: !file output_scrnaseq.h5ad +segmentation: !file prediction.h5ad score: !file score.h5ad HERE diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index f8c1585..4b1aa11 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -31,7 +31,7 @@ publish_dir="resources/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: resources/datasets/**/state.yaml -rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution' +rename_keys: 'input_scrnaseq:output_scrnaseq' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 87d133c..83f37b2 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -23,7 +23,7 @@ publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml -rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution' +rename_keys: 'input_scrnaseq:output_scrnaseq' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 3f4fa2e..694f004 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -17,7 +17,7 @@ arguments: required: true direction: input - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + __merge__: file_scrnaseq.yaml direction: input required: true - name: --output diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 22c77aa..50a1597 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -19,12 +19,12 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: file_spatial_dataset.yaml - direction: output - required: true - - name: "--output_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + # - name: "--output_spatial_dataset" + # __merge__: file_spatial_dataset.yaml + # direction: output + # required: true + - name: "--output_scrnaseq" + __merge__: file_scrnaseq.yaml direction: output required: true test_resources: diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index a7470e9..e2d21e6 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -12,7 +12,7 @@ arguments: direction: input required: true - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + __merge__: file_scrnaseq.yaml direction: input required: true - name: "--output" diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq.yaml similarity index 100% rename from src/api/file_scrnaseq_reference.yaml rename to src/api/file_scrnaseq.yaml diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 0047ae1..25700eb 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -1,11 +1,12 @@ __merge__: ../../api/comp_data_processor.yaml + name: process_dataset + arguments: - name: "--method" type: "string" - description: "The process method to assign train/test." - choices: ["batch", "random"] - default: "batch" + description: "The spatial technology data type." + choices: ["xenium"] - name: "--obs_label" type: "string" description: "Which .obs slot to use as label." @@ -18,14 +19,26 @@ arguments: type: "integer" description: "A seed for the subsampling." example: 123 + - name: "--config" + type: "string" + description: "Config file in json format for data processing parameters." + example: config/config_default.json + resources: - type: python_script path: script.py - - path: /common/helper_functions/subset_h5ad_by_format.py engines: - type: docker + #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work image: openproblems/base_python:1 + setup: + - type: python + packages: scikit-learn + __merge__: + - /src/base/setup_spatialdata_partial.yaml + - type: native + runners: - type: executable diff --git a/src/data_processors/process_dataset/config/config_default.json b/src/data_processors/process_dataset/config/config_default.json new file mode 100644 index 0000000..8d52b6c --- /dev/null +++ b/src/data_processors/process_dataset/config/config_default.json @@ -0,0 +1 @@ +{"span": 1.0, "n_top_genes": 3000} \ No newline at end of file diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 7cca2bd..cd3025e 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -2,15 +2,23 @@ import random import numpy as np import anndata as ad +import scanpy as sc import openproblems as op +import spatialdata as sd +import json +import shutil ## VIASH START par = { 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', - 'output_spatial_dataset': 'output_spatial_dataset.zarr', - 'output_scrnaseq_reference': 'output_scrnaseq_reference.h5ad', + #'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', + 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', + 'method': 'xenium', + 'seed': 123, + 'config': 'task_spatial_segmentation/src/data_processors/process_dataset/config/config_default.json' } + meta = { 'resources_dir': 'target/executable/data_processors/process_dataset', 'config': 'target/executable/data_processors/process_dataset/.config.vsh.yaml' @@ -19,7 +27,6 @@ # import helper functions sys.path.append(meta['resources_dir']) -from subset_h5ad_by_format import subset_h5ad_by_format config = op.project.read_viash_config(meta["config"]) @@ -29,54 +36,44 @@ random.seed(par["seed"]) print(">> Load data", flush=True) -adata = ad.read_h5ad(par["input"]) -print("input:", adata) +adata = ad.read_h5ad(par["input_sc"]) +print("input_sc:", adata) -print(f">> Process data using {par['method']} method") -if par["method"] == "batch": - batch_info = adata.obs[par["obs_batch"]] - batch_categories = batch_info.dtype.categories - test_batches = random.sample(list(batch_categories), 1) - is_test = [ x in test_batches for x in batch_info ] -elif par["method"] == "random": - train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False) - is_test = [ not x in train_ix for x in range(0, adata.n_obs) ] +print(f">> Process {par['method']} data") -# subset the different adatas -print(">> Figuring which data needs to be copied to which output file", flush=True) -# use par arguments to look for label and batch value in different slots -slot_mapping = { - "obs": { - "label": par["obs_label"], - "batch": par["obs_batch"], - } -} +if par['config']: + print(f">> Perform standard data preprocessing") + with open(par['config'], "r") as f: + config = json.load(f) + + # Add config to params + for key, value in config.items(): + setattr(par, key, value) -print(">> Creating train data", flush=True) -output_train = subset_h5ad_by_format( - adata[[not x for x in is_test]], - config, - "output_train", - slot_mapping -) + adata.layers["counts"] = adata.X.copy() + + sc.pp.normalize_total(adata) + sc.pp.log1p(adata) + adata.layers['normlog'] = adata.X + + sc.pp.highly_variable_genes( + adata, + flavor="seurat_v3", + layer="counts", + span=par['span'], + n_top_genes=par['n_top_genes'] + ) -print(">> Creating test data", flush=True) -output_test = subset_h5ad_by_format( - adata[is_test], - config, - "output_test", - slot_mapping -) + adata.var.sort_values("means") + sc.pp.scale(adata, zero_center=False) + adata.layers['normlogscale'] = adata.X + + adata.X = adata.layers['counts'] -print(">> Creating solution data", flush=True) -output_solution = subset_h5ad_by_format( - adata[is_test], - config, - "output_solution", - slot_mapping -) + # cell area normalization + sc.pp.calculate_qc_metrics(adata, inplace=True) + for x in ['transcript_counts', 'n_genes_by_counts']: + adata.obs[f'canorm_{x}'] = adata.obs[f'{x}'] / adata.obs['cell_area'] print(">> Writing data", flush=True) -output_train.write_h5ad(par["output_train"]) -output_test.write_h5ad(par["output_test"]) -output_solution.write_h5ad(par["output_solution"]) +adata.write_h5ad(par["output_scrnaseq"]) \ No newline at end of file diff --git a/src/methods/cellpose/config.vsh.yaml b/src/methods/cellpose/config.vsh.yaml index 46be884..47c6cec 100644 --- a/src/methods/cellpose/config.vsh.yaml +++ b/src/methods/cellpose/config.vsh.yaml @@ -1,11 +1,11 @@ name: cellpose label: "Cellpose" # TODO: update the summary, description and links -summary: "Output of the segmantation methot cellpose" -description: "Output of the segmantation methot cellpose" +summary: "Cellpose-SAM: cell and nucleus segmentation with superhuman generalization." +description: "cellpose is an anatomical segmentation algorithm written in Python 3." links: # these should point to the documentation of the method - documentation: "https://github.com/openproblems-bio/task_ist_preprocessing" - repository: "https://github.com/openproblems-bio/task_ist_preprocessing" + documentation: "https://cellpose.readthedocs.io/en/latest/" + repository: "https://github.com/mouseland/cellpose" references: doi: "10.1038/s41592-020-01018-x" diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index c71286a..127a9e1 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -14,12 +14,12 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: /src/api/file_spatial_dataset.yaml - direction: output - required: true - - name: "--output_scrnaseq_reference" - __merge__: /src/api/file_scrnaseq_reference.yaml + # - name: "--output_spatial_dataset" + # __merge__: /src/api/file_spatial_dataset.yaml + # direction: output + # required: true + - name: "--output_scrnaseq" + __merge__: /src/api/file_scrnaseq.yaml direction: output required: true diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 947a8f1..2be995d 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -44,13 +44,16 @@ workflow run_wf { "input_sc": "input_sc" ], toState: [ - output_spatial_dataset: "output_spatial_dataset", - output_scrnaseq_reference: "output_scrnaseq_reference" + // output_spatial_dataset: "output_spatial_dataset", + output_scrnaseq: "output_scrnaseq" ] ) // only output the files for which an output file was specified - | setState(["output_spatial_dataset", "output_scrnaseq_reference"]) + | setState([ + // "output_spatial_dataset", + "output_scrnaseq" + ]) emit: output_ch diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 4ab5f83..dd7f49b 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -9,7 +9,7 @@ argument_groups: direction: output required: true - name: "--input_scrnaseq_reference" - __merge__: /src/api/file_scrnaseq_reference.yaml + __merge__: /src/api/file_scrnaseq.yaml direction: output required: true - name: Outputs