From fb243f5c3cbf613fe8f28b58efd53cca0069f0e3 Mon Sep 17 00:00:00 2001
From: ErinWeisbart <54687786+ErinWeisbart@users.noreply.github.com>
Date: Thu, 16 May 2024 14:38:42 -0700
Subject: [PATCH] add option to single resolution downsample

---
 documentation/DOZC-documentation/overview.md  |  19 +--
 .../DOZC-documentation/step_2_submit_jobs.md  |   6 +
 files/exampleJob.json                         |   5 +
 run.py                                        |  10 +-
 worker/B2R-worker.py                          | 120 ++++++++++++------
 worker/Dockerfile                             |   6 +-
 6 files changed, 115 insertions(+), 51 deletions(-)

diff --git a/documentation/DOZC-documentation/overview.md b/documentation/DOZC-documentation/overview.md
index 4bf45f9..ac6f849 100644
--- a/documentation/DOZC-documentation/overview.md
+++ b/documentation/DOZC-documentation/overview.md
@@ -1,6 +1,9 @@
-# What is Distributed-OMEZARRCreator?
+# What is Distributed-OMEZarrCreator?
+
+Distributed-OMEZarrCreator is a series of scripts designed to help you create and customize .ome.zarr files on [Amazon Web Services](https://aws.amazon.com/) (AWS) using AWS's file storage and computing systems.
+It includes a Dockerized version of [BioFormats2Raw](https://github.com/ome/bioformats2raw-docker) that can be used to convert many file formats to the .ome.zarr format.
+It also includes an additional script that adds a single extra downsampled resolution to your .ome.zarr; it can be run at the time of conversion or on already-created .ome.zarr files.
 
-Distributed-OMEZARRCreator is a series of scripts designed to help you run a Dockerized version of [BioFormats2Raw](https://github.com/ome/bioformats2raw-docker) on [Amazon Web Services](https://aws.amazon.com/) (AWS) using AWS's file storage and computing systems.
 
 * Data is stored in S3 buckets.
 * Software is run on "Spot Fleets" of computers (or instances) in the cloud.
@@ -9,7 +12,8 @@ Distributed-OMEZARRCreator is a series of scripts designed to help you run a Doc
 
 Docker is a software platform that packages software into containers.
 In a container is the software that you want to run as well as everything needed to run it (e.g. your software source code, operating system libraries, and dependencies).
-Dockerizing a workflow has many benefits including
+Dockerizing a workflow has many benefits including:
+
 * Ease of use: Dockerized software doesn't require the user to install anything themselves.
 * Reproducibility: You don't need to worry about results being affected by the version of your software or its dependencies being used as those are fixed.
 
@@ -20,17 +24,16 @@ This can give you access to far more computing power than you may have available
 
 Each piece of the infrastructure has to be added and configured separately, which can be time-consuming and confusing.
 
-Distributed-OMEZARRCreator tries to leverage the power of the former, while minimizing the problems of the latter.
+Distributed-OMEZarrCreator tries to leverage the power of the former, while minimizing the problems of the latter.
 
 ## What do I need to have to run this?
 
-Essentially all you need to run Distributed-OMEZARRCreator is an AWS account and a terminal program; see our [page on getting set up](step_0_prep.md) for all the specific steps you'll need to take.
-
+Essentially all you need to run Distributed-OMEZarrCreator is an AWS account and a terminal program; see our [page on getting set up](step_0_prep.md) for all the specific steps you'll need to take.
 
-## Can I contribute code to Distributed-OMEZARRCreator?
+## Can I contribute code to Distributed-OMEZarrCreator?
 
 Feel free! We're always looking for ways to improve.
 
 ## Who made this?
 
-Distributed-OMEZARRCreator is a project from the [Cimini Lab](https://cimini-lab.broadinstitute.org) in the Imaging Platform at the Broad Institute in Cambridge, MA, USA.
+Distributed-OMEZarrCreator is a project from the [Cimini Lab](https://cimini-lab.broadinstitute.org) in the Imaging Platform at the Broad Institute in Cambridge, MA, USA.
diff --git a/documentation/DOZC-documentation/step_2_submit_jobs.md b/documentation/DOZC-documentation/step_2_submit_jobs.md
index 24a22b2..e67badb 100644
--- a/documentation/DOZC-documentation/step_2_submit_jobs.md
+++ b/documentation/DOZC-documentation/step_2_submit_jobs.md
@@ -48,6 +48,12 @@ Otherwise set to `false`.
 * **additional_flags:** Enter any additional flags you want passed to BioFormats2Raw.
 Otherwise set to `false`.
 (e.g. `--extra-readers com.glencoesoftware.bioformats2raw.MiraxReader --series 0,2,3,4`)
+* **downsample_after:** Enter `true` if you would like to add an additional, single downsampled resolution after your .ome.zarr files have been created.
+Otherwise set to `false`.
+* **downsample_only:** Enter `true` if you don't need to create .ome.zarr files but instead want to download already-created ones from the input_bucket and add a single downsampled resolution.
+Otherwise set to `false`.
+* **downsample_scale:** Enter the scale factor for the single added downsampled resolution.
+(e.g. `8`)
 * **plates:** The list of all the plates you'd like to process.
 Each plate is an individual task and will be run in parallel.
 (e.g. `["PLATE1", "PLATE2", "PLATE3"]`)
diff --git a/files/exampleJob.json b/files/exampleJob.json
index 7baf40c..d615b4b 100644
--- a/files/exampleJob.json
+++ b/files/exampleJob.json
@@ -17,6 +17,11 @@
     "_comment_on_additional_flags": "Enter any additional flags you want passed to BioFormats2Raw. Otherwise set to false.",
     "_comment_on_additional_flags2": "e.g. --extra-readers com.glencoesoftware.bioformats2raw.MiraxReader",
     "additional_flags": false,
+    "_comment_on_downsample_after": "true if you want to add an additional downsampled resolution after .ome.zarr creation.",
+    "_comment_on_downsample_only": "true if you want to download already-created .ome.zarr files and downsample them.",
+    "downsample_after": false,
+    "downsample_only": false,
+    "downsample_scale": 8,
     "_comment_on_plates": "Each plate is an individual task and will be run in parallel",
     "plates": ["PLATEA","PLATEB","PLATEC","PLATED"]
 }
diff --git a/run.py b/run.py
index 7acb7e2..674c26f 100644
--- a/run.py
+++ b/run.py
@@ -97,7 +97,7 @@ def generate_task_definition(AWS_PROFILE):
     }]
 
     sqs = boto3.client('sqs')
-    queue_name = get_queue_url(sqs)
+    queue_name = get_queue_url(sqs, SQS_QUEUE_NAME)
     task_definition['containerDefinitions'][0]['environment'] += [
         {
             'name': 'APP_NAME',
@@ -194,6 +194,12 @@ def get_or_create_queue(sqs):
     else:
         print('Queue exists')
 
+def loadConfig(configFile):
+    data = None
+    with open(configFile, 'r') as conf:
+        data = json.load(conf)
+    return data
+
 def killdeadAlarms(fleetId,monitorapp,ec2,cloud):
     todel=[]
     changes = ec2.describe_spot_fleet_request_history(SpotFleetRequestId=fleetId,StartTime=(datetime.datetime.now()-datetime.timedelta(hours=2)).replace(microsecond=0))
@@ -556,7 +562,7 @@ def startCluster():
     createMonitor.write('"MONITOR_QUEUE_NAME" : "'+SQS_QUEUE_NAME+'",\n')
     createMonitor.write('"MONITOR_BUCKET_NAME" : "'+AWS_BUCKET+'",\n')
     createMonitor.write('"MONITOR_LOG_GROUP_NAME" : "'+LOG_GROUP_NAME+'",\n')
-    createMonitor.write('"MONITOR_START_TIME" : "'+ starttime+'"}\n')
+    createMonitor.write('"MONITOR_START_TIME" : "'+ starttime+'",\n')
     createMonitor.write('"CLEAN_DASHBOARD" : "'+ CLEAN_DASHBOARD+'"}\n')
     createMonitor.close()
 
diff --git a/worker/B2R-worker.py b/worker/B2R-worker.py
index 5ac6209..cf3d130 100644
--- a/worker/B2R-worker.py
+++ b/worker/B2R-worker.py
@@ -33,6 +33,8 @@ else:
 
 DOWNLOAD_FILES = os.environ["DOWNLOAD_FILES"]
 
+local_root = "/home/ubuntu/local"
+os.makedirs(local_root, exist_ok=True)
 
 #################################
 # CLASS TO HANDLE THE SQS QUEUE
@@ -122,49 +124,87 @@ def runSomething(message):
             return "SUCCESS"
     except KeyError: # Returned if that folder does not exist
         pass
+    # A missing downsample_only key defaults to False, i.e. convert as usual
+    if not message.get("downsample_only", False):
+        # Download files
+        printandlog("Downloading files", logger)
+        plate_path = os.path.join(message["input_location"], message["plate"])
+        local_plate_path = os.path.join(local_root, message["plate"])
+        os.makedirs(local_plate_path, exist_ok=True)
+
+        cmd = f'aws s3 cp s3://{message["input_bucket"]}/{plate_path} {local_plate_path} --recursive'
+        printandlog(f"Running {cmd}", logger)
+        logger.info(cmd)
+        subp = subprocess.Popen(
+            cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+        )
+        monitorAndLog(subp, logger)
+
+        # Build and run the program's command
+        # Use os.path.join to account for trailing slashes on inputs
+        flags = ""
+        if message["resolutions"]:
+            flags = flags + f" --resolutions {message['resolutions']}"
+        if message["tile_width"]:
+            flags = flags + f" --tile_width {message['tile_width']}"
+        if message["tile_height"]:
+            flags = flags + f" --tile_height {message['tile_height']}"
+        if message["target-min-size"]:
+            flags = flags + f" --target-min-size {message['target-min-size']}"
+        if message["additional_flags"]:
+            flags = flags + f" {message['additional_flags']}"
+        index_path = os.path.join(local_plate_path, message["path_to_metadata"])
+        zarr_path = os.path.join(local_root, f"{message['plate']}.ome.zarr")
os.path.join(local_root, f"{message['plate']}.ome.zarr") + cmd = ( + f"/usr/local/bin/_entrypoint.sh bioformats2raw {index_path} {zarr_path} {flags}" + ) - # Download files - printandlog("Downloading files", logger) - plate_path = os.path.join(message["input_location"], message["plate"]) - local_root = "/home/ubuntu/local" - local_plate_path = os.path.join(local_root, message["plate"]) - os.makedirs(local_plate_path, exist_ok=True) - - cmd = f'aws s3 cp s3://{message["input_bucket"]}/{plate_path} {local_plate_path} --recursive' - printandlog(f"Running {cmd}", logger) - logger.info(cmd) - subp = subprocess.Popen( - cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT - ) - monitorAndLog(subp, logger) - - # Build and run the program's command - # Use os.path.join to account for trailing slashes on inputs - flags = "" - if message["resolutions"]: - flags = flags + f" --resolutions {message['resolutions']}" - if message["tile_width"]: - flags = flags + f" --tile_width {message['tile_width']}" - if message["tile_height"]: - flags = flags + f" --tile_height {message['tile_height']}" - if message["target-min-size"]: - flags = flags + f" --target-min-size {message['target-min-size']}" - if message["additional_flags"]: - flags = flags + f" {message['additional_flags']}" - index_path = os.path.join(local_plate_path, message["path_to_metadata"]) - zarr_path = os.path.join(local_root, f"{message['plate']}.ome.zarr") - cmd = ( - f"/usr/local/bin/_entrypoint.sh bioformats2raw {index_path} {zarr_path} {flags}" - ) + printandlog(f"Running {cmd}", logger) + logger.info(cmd) + subp = subprocess.Popen( + cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) + monitorAndLog(subp, logger) - printandlog(f"Running {cmd}", logger) - logger.info(cmd) - subp = subprocess.Popen( - cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT - ) - monitorAndLog(subp, logger) + printandlog("Finished with .ome.zarr creation.", logger) + + # If adding downsample + if message["downsample_after"]: + cmd = ( + f"python3 add_downsampling.py {zarr_path} {message['downsample_scale']}" + ) + + printandlog(f"Downsampling. Running {cmd}", logger) + logger.info(cmd) + subp = subprocess.Popen( + cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) + monitorAndLog(subp, logger) + else: + # Download .ome.zarr + printandlog("Downloading .ome.zarr", logger) + plate_path = os.path.join(message["input_location"], f'{message["plate"]}.ome.zarr') + zarr_path = os.path.join(local_root, f'{message["plate"]}.ome.zarr') + + cmd = f'aws s3 cp s3://{message["input_bucket"]}/{plate_path} {zarr_path} --recursive' + printandlog(f"Running {cmd}", logger) + logger.info(cmd) + subp = subprocess.Popen( + cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) + monitorAndLog(subp, logger) + + cmd = ( + f"python3 add_downsampling.py {zarr_path} {message['downsample_scale']}" + ) + + printandlog(f"Downsampling. 
Running {cmd}", logger) + logger.info(cmd) + subp = subprocess.Popen( + cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) + monitorAndLog(subp, logger) - printandlog("Finished with .ome.zarr creation.", logger) # If done, get the outputs and move them to S3 s3path = os.path.join( diff --git a/worker/Dockerfile b/worker/Dockerfile index e1ad47d..a75d0b5 100644 --- a/worker/Dockerfile +++ b/worker/Dockerfile @@ -6,7 +6,7 @@ # -FROM openmicroscopy/bioformats2raw:0.5.0 +FROM openmicroscopy/bioformats2raw:0.9.1 # Install S3FS USER root @@ -27,6 +27,9 @@ RUN apt-get -y update && \ RUN apt install -y python3.9-dev python3.9-distutils python3-pip +# Install add_downsampling dependencies +RUN python3.9 -m pip install dask zarr ome-zarr + # Install AWS CLI RUN python3.9 -m pip install awscli @@ -45,6 +48,7 @@ WORKDIR /home/ubuntu COPY B2R-worker.py . COPY instance-monitor.py . COPY run-worker.sh . +COPY add_downsampling.py . RUN chmod 755 run-worker.sh WORKDIR /home/ubuntu