From af3cf2e96c741a2c031b61e0d0d9553acb61a93e Mon Sep 17 00:00:00 2001 From: f641l Date: Wed, 22 Apr 2026 18:06:31 +0200 Subject: [PATCH 01/12] changes many files --- README.md | 2 +- scripts/create_resources/resources.sh | 2 +- scripts/create_resources/test_resources.sh | 27 +++--- scripts/run_benchmark/run_full_local.sh | 2 +- scripts/run_benchmark/run_full_seqeracloud.sh | 2 +- src/api/comp_data_processor.yaml | 2 +- .../process_dataset/config.vsh.yaml | 9 +- .../config/config_default.json | 1 + src/data_processors/process_dataset/script.py | 85 +++++++++---------- src/methods/cellpose/config.vsh.yaml | 8 +- .../process_datasets/config.vsh.yaml | 4 +- src/workflows/process_datasets/main.nf | 4 +- 12 files changed, 70 insertions(+), 78 deletions(-) create mode 100644 src/data_processors/process_dataset/config/config_default.json diff --git a/README.md b/README.md index ccf6db4..9cde50f 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ Arguments: | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | | `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | -| `--output_scrnaseq_reference` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. | +| `--output_scrnaseq` | `file` | (*Output*) A single-cell dataset, preprocessed for this benchmark. 
| diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 57f4d68..4ba5075 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -18,7 +18,7 @@ cat > /tmp/params.yaml << 'HERE' input_states: s3://openproblems-data/resources/datasets/**/state.yaml rename_keys: 'input:output_dataset' output_state: '$id/state.yaml' -settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad", "output_solution": "$id/solution.h5ad"}' +settings: '{"output_scrnaseq": "$id/output_scrnaseq.h5ad"}' publish_dir: s3://openproblems-data/resources/task_template/datasets/ HERE diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 9cb372a..26074a9 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -13,23 +13,22 @@ cd "$REPO_ROOT" set -e -RAW_DATA=resources_test/common -DATASET_DIR=resources_test/task_template +RAW_DATA=resources_test/task_spatial_segmentation +DATASET_DIR=resources_test/task_spatial_segmentation mkdir -p $DATASET_DIR # process dataset viash run src/data_processors/process_dataset/config.vsh.yaml -- \ - --input $RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad \ - --output_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \ - --output_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \ - --output_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad + --input_sp $RAW_DATA/mouse_brain_combined/common_ist.zarr \ + --input_sc $RAW_DATA/mouse_brain_combined/common_scrnaseq.h5ad \ + --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ + --output_scrnaseq $DATASET_DIR/mouse_brain_combined/output_scrnaseq.h5ad # run one method viash run src/methods/logistic_regression/config.vsh.yaml -- \ - --input_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \ - --input_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \ - --output 
$DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad + --input $DATASET_DIR/mouse_brain_combined/common_ist.zarr \ + --output $DATASET_DIR/mouse_brain_combined/prediction.h5ad # run one metric viash run src/metrics/accuracy/config.vsh.yaml -- \ @@ -38,12 +37,10 @@ viash run src/metrics/accuracy/config.vsh.yaml -- \ --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad # write manual state.yaml. this is not actually necessary but you never know it might be useful -cat > $DATASET_DIR/cxg_mouse_pancreas_atlas/state.yaml << HERE -id: cxg_mouse_pancreas_atlas -train: !file train.h5ad -test: !file test.h5ad -solution: !file solution.h5ad -prediction: !file prediction.h5ad +cat > $DATASET_DIR/mouse_brain_combined/state.yaml << HERE +id: mouse_brain_combined +processed: !file output_scrnaseq.h5ad +segmentation: !file prediction.h5ad score: !file score.h5ad HERE diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index f8c1585..4b1aa11 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -31,7 +31,7 @@ publish_dir="resources/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: resources/datasets/**/state.yaml -rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution' +rename_keys: 'input_scrnaseq:output_scrnaseq' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 87d133c..83f37b2 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -23,7 +23,7 @@ publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml -rename_keys: 
'input_train:output_train;input_test:output_test;input_solution:output_solution' +rename_keys: 'input_scrnaseq:output_scrnaseq' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 22c77aa..9134d64 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -23,7 +23,7 @@ argument_groups: __merge__: file_spatial_dataset.yaml direction: output required: true - - name: "--output_scrnaseq_reference" + - name: "--output_scrnaseq" __merge__: file_scrnaseq_reference.yaml direction: output required: true diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 0047ae1..92cdd12 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -3,9 +3,8 @@ name: process_dataset arguments: - name: "--method" type: "string" - description: "The process method to assign train/test." - choices: ["batch", "random"] - default: "batch" + description: "The spatial technology data type." + choices: ["xenium"] - name: "--obs_label" type: "string" description: "Which .obs slot to use as label." @@ -18,6 +17,10 @@ arguments: type: "integer" description: "A seed for the subsampling." example: 123 + - name: "--conf" + type: "string" + description: "Config file in json format for data processing parameters." 
+ default: "config/config_default.json" resources: - type: python_script path: script.py diff --git a/src/data_processors/process_dataset/config/config_default.json b/src/data_processors/process_dataset/config/config_default.json new file mode 100644 index 0000000..8d52b6c --- /dev/null +++ b/src/data_processors/process_dataset/config/config_default.json @@ -0,0 +1 @@ +{"span": 1.0, "n_top_genes": 3000} \ No newline at end of file diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 7cca2bd..97dbe78 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -2,14 +2,16 @@ import random import numpy as np import anndata as ad +import scanpy as sc import openproblems as op +import json ## VIASH START par = { 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', - 'output_spatial_dataset': 'output_spatial_dataset.zarr', - 'output_scrnaseq_reference': 'output_scrnaseq_reference.h5ad', + 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', + 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', } meta = { 'resources_dir': 'target/executable/data_processors/process_dataset', @@ -19,7 +21,6 @@ # import helper functions sys.path.append(meta['resources_dir']) -from subset_h5ad_by_format import subset_h5ad_by_format config = op.project.read_viash_config(meta["config"]) @@ -29,54 +30,44 @@ random.seed(par["seed"]) print(">> Load data", flush=True) -adata = ad.read_h5ad(par["input"]) -print("input:", adata) +adata = ad.read_h5ad(par["input_sc"]) +print("input_sc:", adata) -print(f">> Process data using {par['method']} method") -if par["method"] == "batch": - batch_info = adata.obs[par["obs_batch"]] - 
batch_categories = batch_info.dtype.categories - test_batches = random.sample(list(batch_categories), 1) - is_test = [ x in test_batches for x in batch_info ] -elif par["method"] == "random": - train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False) - is_test = [ not x in train_ix for x in range(0, adata.n_obs) ] +print(f">> Process {par['method']} data") -# subset the different adatas -print(">> Figuring which data needs to be copied to which output file", flush=True) -# use par arguments to look for label and batch value in different slots -slot_mapping = { - "obs": { - "label": par["obs_label"], - "batch": par["obs_batch"], - } -} +if par['config']: + print(f">> Perform standard data preprocessing") + with open(par['config'], "r") as f: + config = json.load(f) + + # Add config to params + for key, value in config.items(): + setattr(par, key, value) -print(">> Creating train data", flush=True) -output_train = subset_h5ad_by_format( - adata[[not x for x in is_test]], - config, - "output_train", - slot_mapping -) + adata.layers["counts"] = adata.X.copy() + + sc.pp.normalize_total(adata) + sc.pp.log1p(adata) + adata.layers['normlog'] = adata.X + + sc.pp.highly_variable_genes( + adata, + flavor="seurat_v3", + layer="counts", + span=par['span'], + n_top_genes=par['n_top_genes'] + ) -print(">> Creating test data", flush=True) -output_test = subset_h5ad_by_format( - adata[is_test], - config, - "output_test", - slot_mapping -) + adata.var.sort_values("means") + sc.pp.scale(adata, zero_center=False) + adata.layers['normlogscale'] = adata.X + + adata.X = adata.layers['counts'] -print(">> Creating solution data", flush=True) -output_solution = subset_h5ad_by_format( - adata[is_test], - config, - "output_solution", - slot_mapping -) + # cell area normalization + sc.pp.calculate_qc_metrics(adata, inplace=True) + for x in ['transcript_counts', 'n_genes_by_counts']: + adata.obs[f'canorm_{x}'] = adata.obs[f'{x}'] / adata.obs['cell_area'] print(">> 
Writing data", flush=True) -output_train.write_h5ad(par["output_train"]) -output_test.write_h5ad(par["output_test"]) -output_solution.write_h5ad(par["output_solution"]) +adata.write_h5ad(par["output_scrnaseq"]) diff --git a/src/methods/cellpose/config.vsh.yaml b/src/methods/cellpose/config.vsh.yaml index 46be884..47c6cec 100644 --- a/src/methods/cellpose/config.vsh.yaml +++ b/src/methods/cellpose/config.vsh.yaml @@ -1,11 +1,11 @@ name: cellpose label: "Cellpose" # TODO: update the summary, description and links -summary: "Output of the segmantation methot cellpose" -description: "Output of the segmantation methot cellpose" +summary: "Cellpose-SAM: cell and nucleus segmentation with superhuman generalization." +description: "cellpose is an anatomical segmentation algorithm written in Python 3." links: # these should point to the documentation of the method - documentation: "https://github.com/openproblems-bio/task_ist_preprocessing" - repository: "https://github.com/openproblems-bio/task_ist_preprocessing" + documentation: "https://cellpose.readthedocs.io/en/latest/" + repository: "https://github.com/mouseland/cellpose" references: doi: "10.1038/s41592-020-01018-x" diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index c71286a..fe3b9d4 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -18,8 +18,8 @@ argument_groups: __merge__: /src/api/file_spatial_dataset.yaml direction: output required: true - - name: "--output_scrnaseq_reference" - __merge__: /src/api/file_scrnaseq_reference.yaml + - name: "--output_scrnaseq" + __merge__: /src/api/file_scrnaseq.yaml direction: output required: true diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 947a8f1..226e861 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -45,12 +45,12 @@ workflow run_wf { ], toState: 
[ output_spatial_dataset: "output_spatial_dataset", - output_scrnaseq_reference: "output_scrnaseq_reference" + output_scrnaseq: "output_scrnaseq" ] ) // only output the files for which an output file was specified - | setState(["output_spatial_dataset", "output_scrnaseq_reference"]) + | setState(["output_spatial_dataset", "output_scrnaseq"]) emit: output_ch From ac9c237ca2ae09ed80d7d4a801d9626919bb3483 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 20:02:44 +0200 Subject: [PATCH 02/12] bugfix --- README.md | 8 ++++---- src/api/comp_control_method.yaml | 2 +- src/api/comp_data_processor.yaml | 2 +- src/api/comp_metric.yaml | 2 +- .../{file_scrnaseq_reference.yaml => file_scrnaseq.yaml} | 0 src/data_processors/process_dataset/config.vsh.yaml | 4 +++- src/workflows/run_benchmark/config.vsh.yaml | 2 +- 7 files changed, 11 insertions(+), 9 deletions(-) rename src/api/{file_scrnaseq_reference.yaml => file_scrnaseq.yaml} (100%) diff --git a/README.md b/README.md index 9cde50f..26ed0b1 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ flowchart TB file_common_ist("Common iST Dataset") comp_data_processor[/"Data processor"/] file_spatial_dataset("Raw iST Dataset") - file_scrnaseq_reference("scRNA-seq Reference") + file_scrnaseq("scRNA-seq Reference") comp_control_method[/"Control Method"/] comp_method[/"Method"/] comp_metric[/"Metric"/] @@ -48,11 +48,11 @@ flowchart TB file_common_scrnaseq("Common SC Dataset") file_common_ist---comp_data_processor comp_data_processor-->file_spatial_dataset - comp_data_processor-->file_scrnaseq_reference + comp_data_processor-->file_scrnaseq file_spatial_dataset---comp_control_method file_spatial_dataset---comp_method - file_scrnaseq_reference---comp_control_method - file_scrnaseq_reference---comp_metric + file_scrnaseq---comp_control_method + file_scrnaseq---comp_metric comp_control_method-->file_prediction comp_method-->file_prediction comp_metric-->file_score diff --git a/src/api/comp_control_method.yaml 
b/src/api/comp_control_method.yaml index 3f4fa2e..694f004 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -17,7 +17,7 @@ arguments: required: true direction: input - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + __merge__: file_scrnaseq.yaml direction: input required: true - name: --output diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 9134d64..137cd12 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -24,7 +24,7 @@ argument_groups: direction: output required: true - name: "--output_scrnaseq" - __merge__: file_scrnaseq_reference.yaml + __merge__: file_scrnaseq.yaml direction: output required: true test_resources: diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index a7470e9..e2d21e6 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -12,7 +12,7 @@ arguments: direction: input required: true - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + __merge__: file_scrnaseq.yaml direction: input required: true - name: "--output" diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq.yaml similarity index 100% rename from src/api/file_scrnaseq_reference.yaml rename to src/api/file_scrnaseq.yaml diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 92cdd12..0aa574c 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -1,5 +1,7 @@ __merge__: ../../api/comp_data_processor.yaml + name: process_dataset + arguments: - name: "--method" type: "string" @@ -21,10 +23,10 @@ arguments: type: "string" description: "Config file in json format for data processing parameters." 
default: "config/config_default.json" + resources: - type: python_script path: script.py - - path: /common/helper_functions/subset_h5ad_by_format.py engines: - type: docker diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 4ab5f83..dd7f49b 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -9,7 +9,7 @@ argument_groups: direction: output required: true - name: "--input_scrnaseq_reference" - __merge__: /src/api/file_scrnaseq_reference.yaml + __merge__: /src/api/file_scrnaseq.yaml direction: output required: true - name: Outputs From cb6235df4b464ea334e99c473b5520478d5d6336 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 20:36:27 +0200 Subject: [PATCH 03/12] bugfix for data_process in config.vsh.yaml --- src/data_processors/process_dataset/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 0aa574c..ba70f3a 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -19,7 +19,7 @@ arguments: type: "integer" description: "A seed for the subsampling." example: 123 - - name: "--conf" + - name: "--config" type: "string" description: "Config file in json format for data processing parameters." 
default: "config/config_default.json" From 4e9b8f218fa918f17df50e93fbf78983767fa945 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 20:58:29 +0200 Subject: [PATCH 04/12] bugfix for data_process in config.vsh.yaml and script.py --- src/data_processors/process_dataset/config.vsh.yaml | 6 ++++-- src/data_processors/process_dataset/script.py | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index ba70f3a..c4fc3c6 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -20,9 +20,11 @@ arguments: description: "A seed for the subsampling." example: 123 - name: "--config" - type: "string" + type: file description: "Config file in json format for data processing parameters." - default: "config/config_default.json" + required: true + direction: input + example: config/config_default.json resources: - type: python_script diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 97dbe78..e0954ce 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -12,7 +12,11 @@ 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', + 'method': 'xenium', + 'seed': 123, + 'config': 'config/config_default.json' } + meta = { 'resources_dir': 'target/executable/data_processors/process_dataset', 'config': 'target/executable/data_processors/process_dataset/.config.vsh.yaml' From ae452afb82cf2a0ce475be92d1bb56c92ded7eeb Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:11:45 +0200 Subject: [PATCH 05/12] 
bugfix for data_process in script.py --- src/data_processors/process_dataset/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index e0954ce..d5e16f6 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -14,7 +14,7 @@ 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', 'method': 'xenium', 'seed': 123, - 'config': 'config/config_default.json' + 'config': 'task_spatial_segmentation/src/data_processors/process_dataset/config/config_default.json' } meta = { From 30d308b082be7a3bcff8618f0a5da9fc6ac6f8de Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:16:39 +0200 Subject: [PATCH 06/12] bugfix for data_process in config.vsh.yaml --- src/data_processors/process_dataset/config.vsh.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index c4fc3c6..64444ab 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -20,10 +20,8 @@ arguments: description: "A seed for the subsampling." example: 123 - name: "--config" - type: file + type: "string" description: "Config file in json format for data processing parameters."
- required: true - direction: input example: config/config_default.json resources: From bc0de4e6399e5fe01706306b5697c7788c724780 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:26:16 +0200 Subject: [PATCH 07/12] bugfix for data_process in script.py --- src/data_processors/process_dataset/script.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index d5e16f6..7792c9b 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -5,6 +5,7 @@ import scanpy as sc import openproblems as op import json +import shutil ## VIASH START par = { @@ -75,3 +76,6 @@ print(">> Writing data", flush=True) adata.write_h5ad(par["output_scrnaseq"]) + +print(">> Writing spatial data", flush=True) +shutil.copy(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file From 4e158a57f2c26b9782e984e6d66d99539dd3d5b5 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:32:41 +0200 Subject: [PATCH 08/12] bugfix for data_process in script.py --- src/data_processors/process_dataset/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 7792c9b..fb5ee5a 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -78,4 +78,4 @@ adata.write_h5ad(par["output_scrnaseq"]) print(">> Writing spatial data", flush=True) -shutil.copy(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file +shutil.copytree(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file From cb7d5b4291a569c1c27165a1451e47fa3830f509 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:37:56 +0200 Subject: [PATCH 09/12] bugfix for data_process in script.py --- src/data_processors/process_dataset/script.py | 1 + 1 file 
changed, 1 insertion(+) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index fb5ee5a..7a48f6d 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -4,6 +4,7 @@ import anndata as ad import scanpy as sc import openproblems as op +import spatialdata as sd import json import shutil From cd1f3d2305b656f0a063103d868e5b5c29c3f112 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 22:04:59 +0200 Subject: [PATCH 10/12] changing docker container in config.vsh.yaml --- src/data_processors/process_dataset/config.vsh.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 64444ab..d1dcd00 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -32,6 +32,17 @@ engines: - type: docker image: openproblems/base_python:1 + - type: docker + #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work + image: openproblems/base_python:1 + setup: + - type: python + packages: scikit-learn + __merge__: + - /src/base/setup_spatialdata_partial.yaml + - type: native + + runners: - type: executable - type: nextflow From 59ed4c16a52758e6490033fc46e1bbadbf48a0bb Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 22:09:54 +0200 Subject: [PATCH 11/12] changing docker container in config.vsh.yaml --- src/data_processors/process_dataset/config.vsh.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index d1dcd00..25700eb 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -29,9 +29,6 @@ resources: path: script.py engines: - - type: docker - image:
openproblems/base_python:1 - - type: docker #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work image: openproblems/base_python:1 From c5dfac01afb90b305fbfae097c65c80becc421ed Mon Sep 17 00:00:00 2001 From: f641l Date: Fri, 24 Apr 2026 14:35:15 +0200 Subject: [PATCH 12/12] comment out output_spatial_dataset --- README.md | 2 +- scripts/create_resources/test_resources.sh | 2 +- src/api/comp_data_processor.yaml | 8 ++++---- src/data_processors/process_dataset/script.py | 7 ++----- src/workflows/process_datasets/config.vsh.yaml | 8 ++++---- src/workflows/process_datasets/main.nf | 7 +++++-- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 26ed0b1..7827c92 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,7 @@ Arguments: |:---|:---|:---| | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | -| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | + | `--output_scrnaseq` | `file` | (*Output*) A single-cell dataset, preprocessed for this benchmark. 
| diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 26074a9..b9b99de 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -22,7 +22,7 @@ mkdir -p $DATASET_DIR viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --input_sp $RAW_DATA/mouse_brain_combined/common_ist.zarr \ --input_sc $RAW_DATA/mouse_brain_combined/common_scrnaseq.h5ad \ - --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ + # --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ --output_scrnaseq $DATASET_DIR/mouse_brain_combined/output_scrnaseq.h5ad # run one method diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 137cd12..50a1597 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -19,10 +19,10 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: file_spatial_dataset.yaml - direction: output - required: true + # - name: "--output_spatial_dataset" + # __merge__: file_spatial_dataset.yaml + # direction: output + # required: true - name: "--output_scrnaseq" __merge__: file_scrnaseq.yaml direction: output diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 7a48f6d..cd3025e 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -12,7 +12,7 @@ par = { 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', - 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', + #'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', 'output_scrnaseq': 
'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', 'method': 'xenium', 'seed': 123, @@ -76,7 +76,4 @@ adata.obs[f'canorm_{x}'] = adata.obs[f'{x}'] / adata.obs['cell_area'] print(">> Writing data", flush=True) -adata.write_h5ad(par["output_scrnaseq"]) - -print(">> Writing spatial data", flush=True) -shutil.copytree(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file +adata.write_h5ad(par["output_scrnaseq"]) \ No newline at end of file diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index fe3b9d4..127a9e1 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -14,10 +14,10 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: /src/api/file_spatial_dataset.yaml - direction: output - required: true + # - name: "--output_spatial_dataset" + # __merge__: /src/api/file_spatial_dataset.yaml + # direction: output + # required: true - name: "--output_scrnaseq" __merge__: /src/api/file_scrnaseq.yaml direction: output diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 226e861..2be995d 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -44,13 +44,16 @@ workflow run_wf { "input_sc": "input_sc" ], toState: [ - output_spatial_dataset: "output_spatial_dataset", + // output_spatial_dataset: "output_spatial_dataset", output_scrnaseq: "output_scrnaseq" ] ) // only output the files for which an output file was specified - | setState(["output_spatial_dataset", "output_scrnaseq"]) + | setState([ + // "output_spatial_dataset", + "output_scrnaseq" + ]) emit: output_ch