Skip to content
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ flowchart TB
file_common_ist("<a href='https://github.com/openproblems-bio/task_template#file-format-common-ist-dataset'>Common iST Dataset</a>")
comp_data_processor[/"<a href='https://github.com/openproblems-bio/task_template#component-type-data-processor'>Data processor</a>"/]
file_spatial_dataset("<a href='https://github.com/openproblems-bio/task_template#file-format-raw-ist-dataset'>Raw iST Dataset</a>")
file_scrnaseq_reference("<a href='https://github.com/openproblems-bio/task_template#file-format-scrna-seq-reference'>scRNA-seq Reference</a>")
file_scrnaseq("<a href='https://github.com/openproblems-bio/task_template#file-format-scrna-seq-reference'>scRNA-seq Reference</a>")
comp_control_method[/"<a href='https://github.com/openproblems-bio/task_template#component-type-control-method'>Control Method</a>"/]
comp_method[/"<a href='https://github.com/openproblems-bio/task_template#component-type-method'>Method</a>"/]
comp_metric[/"<a href='https://github.com/openproblems-bio/task_template#component-type-metric'>Metric</a>"/]
Expand All @@ -48,11 +48,11 @@ flowchart TB
file_common_scrnaseq("<a href='https://github.com/openproblems-bio/task_template#file-format-common-sc-dataset'>Common SC Dataset</a>")
file_common_ist---comp_data_processor
comp_data_processor-->file_spatial_dataset
comp_data_processor-->file_scrnaseq_reference
comp_data_processor-->file_scrnaseq
file_spatial_dataset---comp_control_method
file_spatial_dataset---comp_method
file_scrnaseq_reference---comp_control_method
file_scrnaseq_reference---comp_metric
file_scrnaseq---comp_control_method
file_scrnaseq---comp_metric
comp_control_method-->file_prediction
comp_method-->file_prediction
comp_metric-->file_score
Expand Down Expand Up @@ -175,8 +175,8 @@ Arguments:
|:---|:---|:---|
| `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. |
| `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. |
| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. |
| `--output_scrnaseq_reference` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. |
| `--output_scrnaseq` | `file` | (*Output*) A single-cell dataset, preprocessed for this benchmark. |
<!-- | `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | -->

</div>

Expand Down
2 changes: 1 addition & 1 deletion scripts/create_resources/resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ cat > /tmp/params.yaml << 'HERE'
input_states: s3://openproblems-data/resources/datasets/**/state.yaml
rename_keys: 'input:output_dataset'
output_state: '$id/state.yaml'
settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad", "output_solution": "$id/solution.h5ad"}'
settings: '{"output_scrnaseq": "$id/output_scrnaseq.h5ad"}'
publish_dir: s3://openproblems-data/resources/task_template/datasets/
HERE

Expand Down
27 changes: 12 additions & 15 deletions scripts/create_resources/test_resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,22 @@ cd "$REPO_ROOT"

set -e

RAW_DATA=resources_test/common
DATASET_DIR=resources_test/task_template
RAW_DATA=resources_test/task_spatial_segmentation
DATASET_DIR=resources_test/task_spatial_segmentation

mkdir -p $DATASET_DIR

# process dataset
viash run src/data_processors/process_dataset/config.vsh.yaml -- \
--input $RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad \
--output_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \
--output_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \
--output_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad
--input_sp $RAW_DATA/mouse_brain_combined/common_ist.zarr \
--input_sc $RAW_DATA/mouse_brain_combined/common_scrnaseq.h5ad \
--output_scrnaseq $DATASET_DIR/mouse_brain_combined/output_scrnaseq.h5ad
# NOTE: keep disabled arguments below the command. A '#' line placed between
# backslash-continued lines ends the command at that point, so the arguments
# after it would be executed as a separate (failing) command.
# --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr

# run one method
viash run src/methods/logistic_regression/config.vsh.yaml -- \
--input_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \
--input_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \
--output $DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad
--input $DATASET_DIR/mouse_brain_combined/common_ist.zarr \
--output $DATASET_DIR/mouse_brain_combined/prediction.h5ad

# run one metric
viash run src/metrics/accuracy/config.vsh.yaml -- \
Expand All @@ -38,12 +37,10 @@ viash run src/metrics/accuracy/config.vsh.yaml -- \
--output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad

# Write a manual state.yaml. This is not strictly necessary, but it may be useful.
cat > $DATASET_DIR/cxg_mouse_pancreas_atlas/state.yaml << HERE
id: cxg_mouse_pancreas_atlas
train: !file train.h5ad
test: !file test.h5ad
solution: !file solution.h5ad
prediction: !file prediction.h5ad
cat > $DATASET_DIR/mouse_brain_combined/state.yaml << HERE
id: mouse_brain_combined
processed: !file output_scrnaseq.h5ad
segmentation: !file prediction.h5ad
score: !file score.h5ad
HERE

Expand Down
2 changes: 1 addition & 1 deletion scripts/run_benchmark/run_full_local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ publish_dir="resources/results/${RUN_ID}"
# write the parameters to file
cat > /tmp/params.yaml << HERE
input_states: resources/datasets/**/state.yaml
rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution'
rename_keys: 'input_scrnaseq_reference:output_scrnaseq'
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_benchmark/run_full_seqeracloud.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}"
# write the parameters to file
cat > /tmp/params.yaml << HERE
input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml
rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution'
rename_keys: 'input_scrnaseq_reference:output_scrnaseq'
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE
Expand Down
2 changes: 1 addition & 1 deletion src/api/comp_control_method.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ arguments:
required: true
direction: input
- name: "--input_scrnaseq_reference"
__merge__: file_scrnaseq_reference.yaml
__merge__: file_scrnaseq.yaml
direction: input
required: true
- name: --output
Expand Down
12 changes: 6 additions & 6 deletions src/api/comp_data_processor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ argument_groups:
direction: input
- name: Outputs
arguments:
- name: "--output_spatial_dataset"
__merge__: file_spatial_dataset.yaml
direction: output
required: true
- name: "--output_scrnaseq_reference"
__merge__: file_scrnaseq_reference.yaml
# - name: "--output_spatial_dataset"
# __merge__: file_spatial_dataset.yaml
# direction: output
# required: true
- name: "--output_scrnaseq"
__merge__: file_scrnaseq.yaml
direction: output
required: true
test_resources:
Expand Down
2 changes: 1 addition & 1 deletion src/api/comp_metric.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ arguments:
direction: input
required: true
- name: "--input_scrnaseq_reference"
__merge__: file_scrnaseq_reference.yaml
__merge__: file_scrnaseq.yaml
direction: input
required: true
- name: "--output"
Expand Down
File renamed without changes.
21 changes: 17 additions & 4 deletions src/data_processors/process_dataset/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
__merge__: ../../api/comp_data_processor.yaml

name: process_dataset

arguments:
- name: "--method"
type: "string"
description: "The process method to assign train/test."
choices: ["batch", "random"]
default: "batch"
description: "The spatial technology data type."
choices: ["xenium"]
- name: "--obs_label"
type: "string"
description: "Which .obs slot to use as label."
Expand All @@ -18,14 +19,26 @@ arguments:
type: "integer"
description: "A seed for the subsampling."
example: 123
- name: "--config"
  # Declared as a file (not a string) so that Viash treats the value as a path.
  type: "file"
  description: "Config file in json format for data processing parameters."
  example: config/config_default.json

resources:
- type: python_script
path: script.py
- path: /common/helper_functions/subset_h5ad_by_format.py

engines:
- type: docker
#image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work
image: openproblems/base_python:1
setup:
- type: python
packages: scikit-learn
__merge__:
- /src/base/setup_spatialdata_partial.yaml
- type: native


runners:
- type: executable
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"span": 1.0, "n_top_genes": 3000}
91 changes: 44 additions & 47 deletions src/data_processors/process_dataset/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,23 @@
import random
import numpy as np
import anndata as ad
import scanpy as sc
import openproblems as op
import spatialdata as sd
import json
import shutil

## VIASH START
par = {
'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr',
'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad',
'output_spatial_dataset': 'output_spatial_dataset.zarr',
'output_scrnaseq_reference': 'output_scrnaseq_reference.h5ad',
#'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr',
'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad',
'method': 'xenium',
'seed': 123,
'config': 'task_spatial_segmentation/src/data_processors/process_dataset/config/config_default.json'
}

meta = {
'resources_dir': 'target/executable/data_processors/process_dataset',
'config': 'target/executable/data_processors/process_dataset/.config.vsh.yaml'
Expand All @@ -19,7 +27,6 @@

# import helper functions
sys.path.append(meta['resources_dir'])
from subset_h5ad_by_format import subset_h5ad_by_format

config = op.project.read_viash_config(meta["config"])

Expand All @@ -29,54 +36,44 @@
random.seed(par["seed"])

print(">> Load data", flush=True)
adata = ad.read_h5ad(par["input"])
print("input:", adata)
adata = ad.read_h5ad(par["input_sc"])
print("input_sc:", adata)

print(f">> Process data using {par['method']} method")
if par["method"] == "batch":
batch_info = adata.obs[par["obs_batch"]]
batch_categories = batch_info.dtype.categories
test_batches = random.sample(list(batch_categories), 1)
is_test = [ x in test_batches for x in batch_info ]
elif par["method"] == "random":
train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False)
is_test = [ not x in train_ix for x in range(0, adata.n_obs) ]
print(f">> Process {par['method']} data")

# subset the different adatas
print(">> Figuring which data needs to be copied to which output file", flush=True)
# use par arguments to look for label and batch value in different slots
slot_mapping = {
"obs": {
"label": par["obs_label"],
"batch": par["obs_batch"],
}
}
if par['config']:
    print(f">> Perform standard data preprocessing")
    # Read the preprocessing parameters (e.g. 'span', 'n_top_genes') from the JSON file.
    # A separate name is used so the Viash config loaded earlier into `config`
    # is not overwritten.
    with open(par['config'], "r") as f:
        processing_config = json.load(f)

    # Add config to params. `par` is a plain dict, so the values have to be stored
    # as keys (setattr on a dict raises AttributeError) to be readable later as
    # par['span'] / par['n_top_genes'].
    par.update(processing_config)

print(">> Creating train data", flush=True)
output_train = subset_h5ad_by_format(
adata[[not x for x in is_test]],
config,
"output_train",
slot_mapping
)
# Preserve the matrix as loaded in the 'counts' layer before adata.X is transformed.
adata.layers["counts"] = adata.X.copy()

# Total-count normalization followed by log1p, both applied to adata.X.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# Store a copy rather than a reference: adata.X is transformed again further down
# (sc.pp.scale), and a shared object could let that step overwrite this layer.
adata.layers['normlog'] = adata.X.copy()

# Highly variable gene selection computed on the 'counts' layer; 'span' and
# 'n_top_genes' are taken from par.
sc.pp.highly_variable_genes(
    adata,
    flavor="seurat_v3",
    layer="counts",
    span=par['span'],
    n_top_genes=par['n_top_genes']
)
print(">> Creating test data", flush=True)
output_test = subset_h5ad_by_format(
adata[is_test],
config,
"output_test",
slot_mapping
)
# NOTE: a bare `adata.var.sort_values("means")` call used to sit here. Its result was
# discarded, so it had no effect; it is intentionally not replaced by an assignment,
# because reordering .var on its own would no longer match the columns of X.

# Scale the log-normalized matrix without centering and keep it as its own layer.
sc.pp.scale(adata, zero_center=False)
adata.layers['normlogscale'] = adata.X

# Put the values kept in the 'counts' layer back as the main matrix.
adata.X = adata.layers['counts']

print(">> Creating solution data", flush=True)
output_solution = subset_h5ad_by_format(
adata[is_test],
config,
"output_solution",
slot_mapping
)
# cell area normalization
sc.pp.calculate_qc_metrics(adata, inplace=True)
# 'cell_area' and 'transcript_counts' are not created by calculate_qc_metrics; they
# have to be present in .obs already.
# NOTE(review): the data loaded here comes from par["input_sc"]; a single-cell input
# may not carry these columns -- confirm whether this step belongs to the spatial data.
if 'cell_area' in adata.obs:
    for x in ['transcript_counts', 'n_genes_by_counts']:
        if x in adata.obs:
            adata.obs[f'canorm_{x}'] = adata.obs[x] / adata.obs['cell_area']
        else:
            print(f">> Skipping cell area normalization of '{x}': not found in .obs", flush=True)
else:
    print(">> Skipping cell area normalization: 'cell_area' not found in .obs", flush=True)

print(">> Writing data", flush=True)
output_train.write_h5ad(par["output_train"])
output_test.write_h5ad(par["output_test"])
output_solution.write_h5ad(par["output_solution"])
adata.write_h5ad(par["output_scrnaseq"])
8 changes: 4 additions & 4 deletions src/methods/cellpose/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
name: cellpose
label: "Cellpose"
# TODO: update the summary, description and links
summary: "Output of the segmantation methot cellpose"
description: "Output of the segmantation methot cellpose"
summary: "Cellpose-SAM: cell and nucleus segmentation with superhuman generalization."
description: "cellpose is an anatomical segmentation algorithm written in Python 3."
links: # these should point to the documentation of the method
documentation: "https://github.com/openproblems-bio/task_ist_preprocessing"
repository: "https://github.com/openproblems-bio/task_ist_preprocessing"
documentation: "https://cellpose.readthedocs.io/en/latest/"
repository: "https://github.com/mouseland/cellpose"
references:
doi: "10.1038/s41592-020-01018-x"

Expand Down
12 changes: 6 additions & 6 deletions src/workflows/process_datasets/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ argument_groups:
direction: input
- name: Outputs
arguments:
- name: "--output_spatial_dataset"
__merge__: /src/api/file_spatial_dataset.yaml
direction: output
required: true
- name: "--output_scrnaseq_reference"
__merge__: /src/api/file_scrnaseq_reference.yaml
# - name: "--output_spatial_dataset"
# __merge__: /src/api/file_spatial_dataset.yaml
# direction: output
# required: true
- name: "--output_scrnaseq"
__merge__: /src/api/file_scrnaseq.yaml
direction: output
required: true

Expand Down
9 changes: 6 additions & 3 deletions src/workflows/process_datasets/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,16 @@ workflow run_wf {
"input_sc": "input_sc"
],
toState: [
output_spatial_dataset: "output_spatial_dataset",
output_scrnaseq_reference: "output_scrnaseq_reference"
// output_spatial_dataset: "output_spatial_dataset",
output_scrnaseq: "output_scrnaseq"
]
)

// only output the files for which an output file was specified
| setState(["output_spatial_dataset", "output_scrnaseq_reference"])
| setState([
// "output_spatial_dataset",
"output_scrnaseq"
])

emit:
output_ch
Expand Down
2 changes: 1 addition & 1 deletion src/workflows/run_benchmark/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ argument_groups:
direction: output
required: true
- name: "--input_scrnaseq_reference"
__merge__: /src/api/file_scrnaseq_reference.yaml
__merge__: /src/api/file_scrnaseq.yaml
direction: input
required: true
- name: Outputs
Expand Down
Loading