hrm_omero.hrm

Helper functions to interact with the HRM.

View Source
"""Helper functions to interact with the HRM."""

import os.path
import re
import shlex

from bs4 import BeautifulSoup
from loguru import logger as log


def parse_config(filename):
    """Assemble a dict from the HRM config file (shell syntax).

    Usually, the config is located at /etc/hrm.conf and written in shell syntax as
    this file simply gets sourced by the bash init script and other shell based
    tools.

    Only plain `KEY=value` assignments (optionally quoted, optionally separated by
    semicolons) are understood. Quote characters are stripped from the values.

    Parameters
    ----------
    filename : str
        The name of the configuration file to be parsed.

    Returns
    -------
    dict
        A dict with the parsed configuration items.

    Raises
    ------
    SyntaxError
        Raised in case the given configuration file can't be parsed correctly, i.e.
        if a key is not followed by `=` or if the tokenizer fails while looking for
        the `=` sign (e.g. on an unterminated quote).

    Example
    -------
    >>> parse_config('/etc/hrm.conf')
    ... {
    ...     'HRM_DATA': '/export/hrm_data',
    ...     'HRM_DEST': 'dst',
    ...     'HRM_HOME': '/var/www/hrm',
    ...     'HRM_LOG': '/var/log/hrm',
    ...     'HRM_SOURCE': 'src',
    ...     'OMERO_HOSTNAME': 'omero.mynetwork.xy',
    ...     'OMERO_PKG': '/opt/OMERO/OMERO.server',
    ...     'OMERO_PORT': '4064',
    ...     'PHP_CLI': '/usr/local/php/bin/php',
    ...     'SUSER': 'hrm'
    ... }
    """
    log.debug(f"Trying to parse HRM configuration file [{filename}]...")
    config = {}
    with open(filename, "r") as file:
        body = file.read()

    lexer = shlex.shlex(body)
    # treat dashes, dots and slashes as part of a word so that unquoted paths and
    # host names end up in a single token:
    lexer.wordchars += "-./"
    while True:
        token = lexer.get_token()
        # the lexer signals the end of the input with an empty token:
        if not token:
            break
        # it's valid sh syntax to use a semicolon to join lines, so accept it:
        if token == ";":
            continue
        # we assume entries of the following form:
        # KEY="some-value"
        key = token
        try:
            equals = lexer.get_token()
        except ValueError as err:
            # tokenizer errors (e.g. an unterminated quote) mean the file is
            # malformed, report them the documented way instead of carrying on:
            raise SyntaxError(
                f"Can't parse {filename}, invalid syntax in line {lexer.lineno} "
                f"({err})."
            ) from err
        # explicit check (not an `assert`) so it still works with `python -O`:
        if equals != "=":
            raise SyntaxError(
                f"Can't parse {filename}, invalid syntax in line {lexer.lineno} "
                f"(expected '=', found '{equals}')."
            )
        value = lexer.get_token()
        value = value.replace('"', "")  # remove double quotes
        value = value.replace("'", "")  # remove single quotes
        config[key] = value
    log.info("Successfully parsed [{}].", filename)
    return config


def check_config(config):
    """Check the config dict for required entries.

    Parameters
    ----------
    config : dict
        A dict with a parsed configuration, as returned by `parse_config()`.

    Raises
    ------
    SyntaxError
        Raised in case one of the required configuration items is missing.
    """
    required = ["OMERO_PKG", "OMERO_HOSTNAME"]
    for entry in required:
        if entry not in config:
            raise SyntaxError(f"Missing '{entry}' in the HRM config file.")
    log.debug("HRM config file passed all checks.")


def job_parameter_summary(fname):
    """Render the HRM job parameter summary as plain text.

    The HTML parameter summary written by the HRM is handed to `parse_summary()`
    and the resulting nested dict is flattened into a human-readable text: each
    section title is followed by a separator line and one `name: value` line per
    parameter.

    Parameters
    ----------
    fname : str
        The filename of the job's HTML parameter summary.

    Returns
    -------
    str
        The formatted parameter summary, or `None` in case `parse_summary()`
        returned `None`.
    """
    sections = parse_summary(fname)
    if sections is None:
        return None

    # collect all text fragments first and join them once at the end:
    chunks = []
    for title, parameters in sections.items():
        chunks.append(f"{title}\n==============================\n")
        chunks.extend(f"{name}: {value}\n" for name, value in parameters.items())
    summary = "".join(chunks)

    log.debug(f"Job parameter summary:\n---\n{summary}---")
    log.success("Generated parameter summary.")
    return summary


def parse_job_basename(fname):
    """Strip the file suffix following an HRM job label from a file name.

    The HRM derives its job IDs from PHP's `uniqid()`, which yields 13 hexadecimal
    digits (8 for the UNIX time, 5 for the microseconds). Result files are labeled
    by appending `_<job-id>_hrm` to the name, followed by the actual file suffix.
    This function looks for such a label and drops everything *after* it.

    This allows removing the suffix of a result file without having to know what
    the suffix looks like (e.g. `.ics`, `.ome.tif` or similar).

    Parameters
    ----------
    fname : str
        The string to process, usually the name of an HRM result file (any string
        is accepted).

    Returns
    -------
    str
        The input string with everything following an HRM-like job label (e.g.
        `_abcdef0123456_hrm` or `_f435a27b9c85e_hrm`) removed. A string that
        doesn't contain such a label followed by a dot is returned unchanged.
    """
    log.trace(fname)
    # the job label is captured as group 1, the dot and the remainder are dropped:
    label_with_suffix = re.compile(r"(_[0-9a-f]{13}_hrm)\..*")
    stripped = label_with_suffix.sub(r"\1", fname)
    log.trace(stripped)
    return stripped


def parse_summary(fname):
    """Parse the job parameter summary generated by HRM into a dict.

    Parse the HTML file generated by the HRM containing the parameter summary and
    generate a nested dict from it. The HTML file is assumed to contain three `<table>`
    items that contain a single `<td class="header">` item with the title and a `<tr>`
    section with four `<td>` items per parameter (being *parameter-name*, *channel*,
    *source* and *value*), e.g. something of this form:

    ```
    _____________________________________________
    |___________________title___________________|
    |_________________(ignored)_________________|
    | parameter-name | channel | source | value |
    ...
    | parameter-name | channel | source | value |
    ---------------------------------------------
    ```

    Parameters
    ----------
    fname : str
        The filename of the job's HTML parameter summary or (e.g.) the resulting image
        file. In case `fname` doesn't end in the common parameter summary suffix (for
        example if the image file name was provided), the function tries to derive the
        name of summary file and use that one for parsing.

    Returns
    -------
    dict(dict)
        A dict with the parsed section names (table titles) being the keys, each
        containing another dict with the parameter names as keys (including the channel
        unless the parameter is channel-independent). See the example below. In case
        the file can't be opened, `None` is returned.

    Raises
    ------
    KeyError
        Raised in case a table header or a parameter key (within one table) shows up
        more than once.

    Example
    -------
    >>> parse_summary('image_001.parameters.txt')
    ... {
    ...     "Image Parameters": {
    ...         "Emission wavelength (nm) [ch:0]": "567.000",
    ...         "Excitation wavelength (nm) [ch:0]": "456.000",
    ...         "Lens refractive index [ch:0]": "4.567",
    ...         "Microscope type [ch:0]": "widefield",
    ...         "Numerical aperture [ch:0]": "2.345",
    ...         "Point Spread Function": "theoretical",
    ...         "Sample refractive index [ch:0]": "3.456",
    ...         "Time interval (s)": "1.000000",
    ...         "X pixel size (μm)": "0.123456",
    ...         "Y pixel size (μm)": "0.123456",
    ...         "Z step size (μm)": "0.234567",
    ...     },
    ...     "Restoration Parameters": {
    ...         "Autocrop": "no",
    ...         "Background estimation": "auto",
    ...         "Deconvolution algorithm": "iiff",
    ...         "Number of iterations": "42",
    ...         "Quality stop criterion": "0.000007",
    ...         "Signal/Noise ratio [ch:0]": "99",
    ...     },
    ... }
    """
    # In case `fname` doesn't end with the common suffix for job summary files check if
    # it is the actual *image* filename of an HRM job and try to use the corresponding
    # parameter summary file instead:
    suffix = ".parameters.txt"
    if not fname.endswith(suffix):
        candidate = parse_job_basename(fname) + suffix
        if os.path.exists(candidate):
            log.debug(f"Found [{candidate}], will use it instead of [{fname}].")
            fname = candidate

    log.debug(f"Trying to parse job parameter summary file [{fname}]...")

    try:
        with open(fname, "r", encoding="utf-8") as soupfile:
            soup = BeautifulSoup(soupfile, features="html.parser")
    except IOError as err:
        log.error(f"Unable to open parameter summary file [{fname}]: {err}")
        return None

    sections = {}  # job parameter summaries have multiple sections split by headers
    total_rows = 0  # number of <tr> items seen across *all* tables
    for table in soup.find_all("table"):
        rows = table.find_all("tr")
        total_rows += len(rows)
        # the first <tr> of each table holds the section title:
        header = rows[0].find_all("td", class_="header")[0].text
        log.trace(f"Parsed table header: {header}")
        if header in sections:
            raise KeyError(f"Error parsing parameters, duplicate header: {header}")

        pairs = {}
        # and the table body, starting from the 3rd <tr> item:
        for row in rows[2:]:
            cols = row.find_all("td")
            # parse the parameter "name":
            param_key = cols[0].text
            log.trace(f"Parsed (raw) key name: {param_key}")
            # replace HTML-encoded chars:
            param_key = param_key.replace("&mu;m", "µm")

            # parse the channel and add it to the key-string (unless it's "All"):
            channel = cols[1].text
            if channel == "All":
                channel = ""
            else:
                channel = f" [ch:{channel}]"
            param_key += channel

            # parse the parameter value (the 3rd column, "source", is ignored):
            param_value = cols[3].text

            # finally add a new entry to the dict unless the key already exists:
            if param_key in pairs:
                raise KeyError(f"Parsing failed, duplicate parameter: {param_key}")
            pairs[param_key] = param_value
        sections[header] = pairs

    log.success(f"Processed {total_rows} table rows.")
    return sections
#   def parse_config(filename):
View Source
def parse_config(filename):
    """Assemble a dict from the HRM config file (shell syntax).

    Usually, the config is located at /etc/hrm.conf and written in shell syntax as
    this file simply gets sourced by the bash init script and other shell based
    tools.

    Only plain `KEY=value` assignments (optionally quoted, optionally separated by
    semicolons) are understood. Quote characters are stripped from the values.

    Parameters
    ----------
    filename : str
        The name of the configuration file to be parsed.

    Returns
    -------
    dict
        A dict with the parsed configuration items.

    Raises
    ------
    SyntaxError
        Raised in case the given configuration file can't be parsed correctly, i.e.
        if a key is not followed by `=` or if the tokenizer fails while looking for
        the `=` sign (e.g. on an unterminated quote).

    Example
    -------
    >>> parse_config('/etc/hrm.conf')
    ... {
    ...     'HRM_DATA': '/export/hrm_data',
    ...     'HRM_DEST': 'dst',
    ...     'HRM_HOME': '/var/www/hrm',
    ...     'HRM_LOG': '/var/log/hrm',
    ...     'HRM_SOURCE': 'src',
    ...     'OMERO_HOSTNAME': 'omero.mynetwork.xy',
    ...     'OMERO_PKG': '/opt/OMERO/OMERO.server',
    ...     'OMERO_PORT': '4064',
    ...     'PHP_CLI': '/usr/local/php/bin/php',
    ...     'SUSER': 'hrm'
    ... }
    """
    log.debug(f"Trying to parse HRM configuration file [{filename}]...")
    config = {}
    with open(filename, "r") as file:
        body = file.read()

    lexer = shlex.shlex(body)
    # treat dashes, dots and slashes as part of a word so that unquoted paths and
    # host names end up in a single token:
    lexer.wordchars += "-./"
    while True:
        token = lexer.get_token()
        # the lexer signals the end of the input with an empty token:
        if not token:
            break
        # it's valid sh syntax to use a semicolon to join lines, so accept it:
        if token == ";":
            continue
        # we assume entries of the following form:
        # KEY="some-value"
        key = token
        try:
            equals = lexer.get_token()
        except ValueError as err:
            # tokenizer errors (e.g. an unterminated quote) mean the file is
            # malformed, report them the documented way instead of carrying on:
            raise SyntaxError(
                f"Can't parse {filename}, invalid syntax in line {lexer.lineno} "
                f"({err})."
            ) from err
        # explicit check (not an `assert`) so it still works with `python -O`:
        if equals != "=":
            raise SyntaxError(
                f"Can't parse {filename}, invalid syntax in line {lexer.lineno} "
                f"(expected '=', found '{equals}')."
            )
        value = lexer.get_token()
        value = value.replace('"', "")  # remove double quotes
        value = value.replace("'", "")  # remove single quotes
        config[key] = value
    log.info("Successfully parsed [{}].", filename)
    return config

Assemble a dict from the HRM config file (shell syntax).

Usually, the config is located at /etc/hrm.conf and written in shell syntax as this file simply gets sourced by the bash init script and other shell based tools.

Parameters
  • filename (str): The name of the configuration file to be parsed.
Returns
  • dict: A dict with the parsed configuration items.
Raises
  • SyntaxError: Raised in case the given configuration file can't be parsed correctly.
Example
>>> parse_config('/etc/hrm.conf')
... {
...     'HRM_DATA': '/export/hrm_data',
...     'HRM_DEST': 'dst',
...     'HRM_HOME': '/var/www/hrm',
...     'HRM_LOG': '/var/log/hrm',
...     'HRM_SOURCE': 'src',
...     'OMERO_HOSTNAME': 'omero.mynetwork.xy',
...     'OMERO_PKG': '/opt/OMERO/OMERO.server',
...     'OMERO_PORT': '4064',
...     'PHP_CLI': '/usr/local/php/bin/php',
...     'SUSER': 'hrm'
... }
#   def check_config(config):
View Source
def check_config(config):
    """Check the config dict for required entries.

    Parameters
    ----------
    config : dict
        A dict with a parsed configuration, as returned by `parse_config()`.

    Raises
    ------
    SyntaxError
        Raised in case one of the required configuration items is missing.
    """
    required = ["OMERO_PKG", "OMERO_HOSTNAME"]
    for entry in required:
        if entry not in config:
            raise SyntaxError(f"Missing '{entry}' in the HRM config file.")
    log.debug("HRM config file passed all checks.")

Check the config dict for required entries.

Parameters
  • config (dict): A dict with a parsed configuration, as returned by parse_config().
Raises
  • SyntaxError: Raised in case one of the required configuration items is missing.
#   def job_parameter_summary(fname):
View Source
def job_parameter_summary(fname):
    """Render the HRM job parameter summary as plain text.

    The HTML parameter summary written by the HRM is handed to `parse_summary()`
    and the resulting nested dict is flattened into a human-readable text: each
    section title is followed by a separator line and one `name: value` line per
    parameter.

    Parameters
    ----------
    fname : str
        The filename of the job's HTML parameter summary.

    Returns
    -------
    str
        The formatted parameter summary, or `None` in case `parse_summary()`
        returned `None`.
    """
    sections = parse_summary(fname)
    if sections is None:
        return None

    # collect all text fragments first and join them once at the end:
    chunks = []
    for title, parameters in sections.items():
        chunks.append(f"{title}\n==============================\n")
        chunks.extend(f"{name}: {value}\n" for name, value in parameters.items())
    summary = "".join(chunks)

    log.debug(f"Job parameter summary:\n---\n{summary}---")
    log.success("Generated parameter summary.")
    return summary

Generate a parameter summary text from the HRM-generated HTML file.

Call the parser for the HTML file generated by the HRM containing the parameter summary and generate a plain-text version from the parsed results.

Parameters
  • fname (str): The filename of the job's HTML parameter summary.
Returns
  • str: The formatted string containing the parameter summary, or None if the summary file could not be opened.
#   def parse_job_basename(fname):
View Source
def parse_job_basename(fname):
    """Strip the file suffix following an HRM job label from a file name.

    The HRM derives its job IDs from PHP's `uniqid()`, which yields 13 hexadecimal
    digits (8 for the UNIX time, 5 for the microseconds). Result files are labeled
    by appending `_<job-id>_hrm` to the name, followed by the actual file suffix.
    This function looks for such a label and drops everything *after* it.

    This allows removing the suffix of a result file without having to know what
    the suffix looks like (e.g. `.ics`, `.ome.tif` or similar).

    Parameters
    ----------
    fname : str
        The string to process, usually the name of an HRM result file (any string
        is accepted).

    Returns
    -------
    str
        The input string with everything following an HRM-like job label (e.g.
        `_abcdef0123456_hrm` or `_f435a27b9c85e_hrm`) removed. A string that
        doesn't contain such a label followed by a dot is returned unchanged.
    """
    log.trace(fname)
    # the job label is captured as group 1, the dot and the remainder are dropped:
    label_with_suffix = re.compile(r"(_[0-9a-f]{13}_hrm)\..*")
    stripped = label_with_suffix.sub(r"\1", fname)
    log.trace(stripped)
    return stripped

Parse the basename from an HRM job result file name.

HRM job IDs are generated via PHP's uniqid() call, which returns a 13-digit hexadecimal string (8 digits UNIX time and 5 digits microseconds). The HRM labels its result files by appending an underscore (_) followed by this ID and an _hrm suffix. This function tries to match this section and remove everything after it from the name.

Its intention is to safely remove the suffix from an image file name while making no assumptions about what the suffix looks like (could e.g. be .ics, .ome.tif or similar).

Parameters
  • fname (str): The input string, usually the name of an HRM result file (but any string is accepted).
Returns
  • str: The input string (fname) where everything after an HRM-like job label (e.g. _abcdef0123456_hrm or _f435a27b9c85e_hrm) is removed. In case the input string does not contain a matching section, it is returned unchanged.
#   def parse_summary(fname):
View Source
def parse_summary(fname):
    """Parse the job parameter summary generated by HRM into a dict.

    Parse the HTML file generated by the HRM containing the parameter summary and
    generate a nested dict from it. The HTML file is assumed to contain three `<table>`
    items that contain a single `<td class="header">` item with the title and a `<tr>`
    section with four `<td>` items per parameter (being *parameter-name*, *channel*,
    *source* and *value*), e.g. something of this form:

    ```
    _____________________________________________
    |___________________title___________________|
    |_________________(ignored)_________________|
    | parameter-name | channel | source | value |
    ...
    | parameter-name | channel | source | value |
    ---------------------------------------------
    ```

    Parameters
    ----------
    fname : str
        The filename of the job's HTML parameter summary or (e.g.) the resulting image
        file. In case `fname` doesn't end in the common parameter summary suffix (for
        example if the image file name was provided), the function tries to derive the
        name of summary file and use that one for parsing.

    Returns
    -------
    dict(dict)
        A dict with the parsed section names (table titles) being the keys, each
        containing another dict with the parameter names as keys (including the channel
        unless the parameter is channel-independent). See the example below. In case
        the file can't be opened, `None` is returned.

    Raises
    ------
    KeyError
        Raised in case a table header or a parameter key (within one table) shows up
        more than once.

    Example
    -------
    >>> parse_summary('image_001.parameters.txt')
    ... {
    ...     "Image Parameters": {
    ...         "Emission wavelength (nm) [ch:0]": "567.000",
    ...         "Excitation wavelength (nm) [ch:0]": "456.000",
    ...         "Lens refractive index [ch:0]": "4.567",
    ...         "Microscope type [ch:0]": "widefield",
    ...         "Numerical aperture [ch:0]": "2.345",
    ...         "Point Spread Function": "theoretical",
    ...         "Sample refractive index [ch:0]": "3.456",
    ...         "Time interval (s)": "1.000000",
    ...         "X pixel size (μm)": "0.123456",
    ...         "Y pixel size (μm)": "0.123456",
    ...         "Z step size (μm)": "0.234567",
    ...     },
    ...     "Restoration Parameters": {
    ...         "Autocrop": "no",
    ...         "Background estimation": "auto",
    ...         "Deconvolution algorithm": "iiff",
    ...         "Number of iterations": "42",
    ...         "Quality stop criterion": "0.000007",
    ...         "Signal/Noise ratio [ch:0]": "99",
    ...     },
    ... }
    """
    # In case `fname` doesn't end with the common suffix for job summary files check if
    # it is the actual *image* filename of an HRM job and try to use the corresponding
    # parameter summary file instead:
    suffix = ".parameters.txt"
    if not fname.endswith(suffix):
        candidate = parse_job_basename(fname) + suffix
        if os.path.exists(candidate):
            log.debug(f"Found [{candidate}], will use it instead of [{fname}].")
            fname = candidate

    log.debug(f"Trying to parse job parameter summary file [{fname}]...")

    try:
        with open(fname, "r", encoding="utf-8") as soupfile:
            soup = BeautifulSoup(soupfile, features="html.parser")
    except IOError as err:
        log.error(f"Unable to open parameter summary file [{fname}]: {err}")
        return None

    sections = {}  # job parameter summaries have multiple sections split by headers
    total_rows = 0  # number of <tr> items seen across *all* tables
    for table in soup.find_all("table"):
        rows = table.find_all("tr")
        total_rows += len(rows)
        # the first <tr> of each table holds the section title:
        header = rows[0].find_all("td", class_="header")[0].text
        log.trace(f"Parsed table header: {header}")
        if header in sections:
            raise KeyError(f"Error parsing parameters, duplicate header: {header}")

        pairs = {}
        # and the table body, starting from the 3rd <tr> item:
        for row in rows[2:]:
            cols = row.find_all("td")
            # parse the parameter "name":
            param_key = cols[0].text
            log.trace(f"Parsed (raw) key name: {param_key}")
            # replace HTML-encoded chars:
            param_key = param_key.replace("&mu;m", "µm")

            # parse the channel and add it to the key-string (unless it's "All"):
            channel = cols[1].text
            if channel == "All":
                channel = ""
            else:
                channel = f" [ch:{channel}]"
            param_key += channel

            # parse the parameter value (the 3rd column, "source", is ignored):
            param_value = cols[3].text

            # finally add a new entry to the dict unless the key already exists:
            if param_key in pairs:
                raise KeyError(f"Parsing failed, duplicate parameter: {param_key}")
            pairs[param_key] = param_value
        sections[header] = pairs

    log.success(f"Processed {total_rows} table rows.")
    return sections

Parse the job parameter summary generated by HRM into a dict.

Parse the HTML file generated by the HRM containing the parameter summary and generate a nested dict from it. The HTML file is assumed to contain three <table> items that contain a single <td class="header"> item with the title and a <tr> section with four <td> items per parameter (being parameter-name, channel, source and value), e.g. something of this form:

|___________________title___________________|
|_________________(ignored)_________________|
| parameter-name | channel | source | value |
...
| parameter-name | channel | source | value |
---------------------------------------------
Parameters
  • fname (str): The filename of the job's HTML parameter summary or (e.g.) the resulting image file. In case fname doesn't end in the common parameter summary suffix (for example if the image file name was provided), the function tries to derive the name of summary file and use that one for parsing.
Returns
  • dict(dict): A dict with the parsed section names (table titles) being the keys, each containing another dict with the parameter names as keys (including the channel unless the parameter is channel-independent). See the example below.
Example
>>> parse_summary('image_001.parameters.txt')
... {
...     "Image Parameters": {
...         "Emission wavelength (nm) [ch:0]": "567.000",
...         "Excitation wavelength (nm) [ch:0]": "456.000",
...         "Lens refractive index [ch:0]": "4.567",
...         "Microscope type [ch:0]": "widefield",
...         "Numerical aperture [ch:0]": "2.345",
...         "Point Spread Function": "theoretical",
...         "Sample refractive index [ch:0]": "3.456",
...         "Time interval (s)": "1.000000",
...         "X pixel size (μm)": "0.123456",
...         "Y pixel size (μm)": "0.123456",
...         "Z step size (μm)": "0.234567",
...     },
...     "Restoration Parameters": {
...         "Autocrop": "no",
...         "Background estimation": "auto",
...         "Deconvolution algorithm": "iiff",
...         "Number of iterations": "42",
...         "Quality stop criterion": "0.000007",
...         "Signal/Noise ratio [ch:0]": "99",
...     },
... }