hrm_omero.hrm

Helper functions to interact with the HRM.

View Source

  1"""Helper functions to interact with the HRM."""
  2
  3import os.path
  4import re
  5import shlex
  6
  7from bs4 import BeautifulSoup
  8from loguru import logger as log
  9
 10
 11def parse_config(filename):
 12    """Assemble a dict from the HRM config file (shell syntax).
 13
 14    Usually, the config is located at /etc/hrm.conf and written in shell syntax as
 15    this file simply gets sourced by the bash init script and other shell based
 16    tools.
 17
 18    Parameters
 19    ----------
 20    filename : str
 21        The name of the configuration file to be parsed.
 22
 23    Returns
 24    -------
 25    dict
 26        A dict with the parsed configuration items.
 27
 28    Raises
 29    ------
 30    SyntaxError
 31        Raised in case the given configuration file can't be parsed correctly.
 32
 33    Example
 34    -------
 35    >>> parse_config('/etc/hrm.conf')
 36    ... {
 37    ...     'HRM_DATA': '/export/hrm_data',
 38    ...     'HRM_DEST': 'dst',
 39    ...     'HRM_HOME': '/var/www/hrm',
 40    ...     'HRM_LOG': '/var/log/hrm',
 41    ...     'HRM_SOURCE': 'src',
 42    ...     'OMERO_HOSTNAME': 'omero.mynetwork.xy',
 43    ...     'OMERO_PKG': '/opt/OMERO/OMERO.server',
 44    ...     'OMERO_PORT': '4064',
 45    ...     'PHP_CLI': '/usr/local/php/bin/php',
 46    ...     'SUSER': 'hrm'
 47    ... }
 48    """
 49    log.debug(f"Trying to parse HRM configuration file [{filename}]...")
 50    config = dict()
 51    with open(filename, "r", encoding="utf-8") as file:
 52        body = file.read()
 53
 54    lexer = shlex.shlex(body)
 55    lexer.wordchars += "-./"
 56    while True:
 57        token = lexer.get_token()
 58        if token is None or token == "":
 59            break
 60        # it's valid sh syntax to use a semicolon to join lines, so accept it:
 61        if token == ";":
 62            continue
 63        # we assume entries of the following form:
 64        # KEY="some-value"
 65        key = token
 66        try:
 67            equals = lexer.get_token()
 68            assert equals == "="
 69        except AssertionError:
 70            raise SyntaxError(  # pylint: disable-msg=raise-missing-from
 71                f"Can't parse {filename}, invalid syntax in line {lexer.lineno} "
 72                f"(expected '=', found '{equals}')."
 73            )
 74        except Exception as err:  # pragma: no cover # pylint: disable-msg=broad-except
 75            log.warning(f"Error parsing config: {err}")
 76        value = lexer.get_token()
 77        value = value.replace('"', "")  # remove double quotes
 78        value = value.replace("'", "")  # remove single quotes
 79        config[key] = value
 80    log.info("Successfully parsed [{}].", filename)
 81    return config
 82
 83
 84def check_config(config):
 85    """Check the config dict for required entries.
 86
 87    Parameters
 88    ----------
 89    config : dict
 90        A dict with a parsed configuration, as returned by `parse_hrm_conf()`.
 91
 92    Raises
 93    ------
 94    SyntaxError
 95        Raised in case one of the required configuration items is missing.
 96    """
 97    required = ["OMERO_PKG", "OMERO_HOSTNAME"]
 98    for entry in required:
 99        if entry not in config:
100            raise SyntaxError(f"Missing '{entry}'' in the HRM config file.")
101    log.debug("HRM config file passed all checks.")
102
103
def job_parameter_summary(fname):
    """Generate a parameter summary text from the HRM-generated HTML file.

    Call the parser for the HTML file generated by the HRM containing the parameter
    summary and generate a plain-text version from the parsed results.

    Parameters
    ----------
    fname : str
        The filename of the job's HTML parameter summary.

    Returns
    -------
    str or None
        The formatted string containing the parameter summary, or `None` in case
        `parse_summary()` returned `None`.
    """
    parsed = parse_summary(fname)
    if parsed is None:
        return None

    # collect all fragments first and join them once at the end:
    fragments = []
    for section, parameters in parsed.items():
        fragments.append(f"{section}\n==============================\n")
        fragments.extend(f"{name}: {value}\n" for name, value in parameters.items())
    summary = "".join(fragments)

    log.debug(f"Job parameter summary:\n---\n{summary}---")
    log.success("Generated parameter summary.")
    return summary
132
133
def parse_job_basename(fname):
    """Strip the file suffix following an HRM job label from a file name.

    HRM job IDs are generated via PHP's `uniqid()` call, which returns a 13-digit
    hexadecimal string (8 digits UNIX time and 5 digits microseconds). The HRM labels
    its result files by appending an underscore (`_`) followed by this ID and an
    `_hrm` suffix. This function tries to match this section and, if it is followed
    by a dot, removes the dot and everything *after* it from the name.

    The intention is to safely remove the suffix from an image file name while making
    no assumptions about what the suffix looks like (could e.g. be `.ics`, `.ome.tif`
    or similar).

    Parameters
    ----------
    fname : str
        The input string, usually the name of an HRM result file (but any string is
        accepted).

    Returns
    -------
    str
        The input string (`fname`) where everything *after* an HRM-like job label (e.g.
        `_abcdef0123456_hrm` or `_f435a27b9c85e_hrm`) is removed. In case the input
        string does *not* contain a matching section it is returned unchanged.
    """
    log.trace(fname)
    # job label (kept as group 1) followed by a dot and the remaining suffix:
    label_and_suffix = re.compile(r"(_[0-9a-f]{13}_hrm)\..*")
    stripped = label_and_suffix.sub(r"\1", fname)
    log.trace(stripped)
    return stripped
164
165
def parse_summary(fname):
    """Parse the job parameter summary generated by HRM into a dict.

    Parse the HTML file generated by the HRM containing the parameter summary and
    generate a nested dict from it. The HTML file is assumed to contain three `<table>`
    items that contain a single `<td class="header">` item with the title and a `<tr>`
    section with four `<td>` items per parameter (being *parameter-name*, *channel*,
    *source* and *value*), e.g. something of this form:

    ```
    _____________________________________________
    |___________________title___________________|
    |_________________(ignored)_________________|
    | parameter-name | channel | source | value |
    ...
    | parameter-name | channel | source | value |
    ---------------------------------------------
    ```

    Tables that don't have a header cell in their first row are skipped.

    Parameters
    ----------
    fname : str
        The filename of the job's HTML parameter summary or (e.g.) the resulting image
        file. In case `fname` doesn't end in the common parameter summary suffix (for
        example if the image file name was provided), the function tries to derive the
        name of summary file and use that one for parsing.

    Returns
    -------
    dict(dict) or None
        A dict with the parsed section names (table titles) being the keys, each
        containing another dict with the parameter names as keys (including the channel
        unless the parameter is channel-independent). See the example below. In case
        the file can't be opened or parsed, `None` is returned.

    Raises
    ------
    KeyError
        Raised in case a table header or a parameter name (within one table) is found
        more than once.
    IndexError
        Raised in case a parameter row has fewer than four `<td>` items.

    Example
    -------
    >>> parse_summary('image_001.parameters.txt')
    ... {
    ...     "Image Parameters": {
    ...         "Emission wavelength (nm) [ch:0]": "567.000",
    ...         "Excitation wavelength (nm) [ch:0]": "456.000",
    ...         "Lens refractive index [ch:0]": "4.567",
    ...         "Microscope type [ch:0]": "widefield",
    ...         "Numerical aperture [ch:0]": "2.345",
    ...         "Point Spread Function": "theoretical",
    ...         "Sample refractive index [ch:0]": "3.456",
    ...         "Time interval (s)": "1.000000",
    ...         "X pixel size (μm)": "0.123456",
    ...         "Y pixel size (μm)": "0.123456",
    ...         "Z step size (μm)": "0.234567",
    ...     },
    ...     "Restoration Parameters": {
    ...         "Autocrop": "no",
    ...         "Background estimation": "auto",
    ...         "Deconvolution algorithm": "iiff",
    ...         "Number of iterations": "42",
    ...         "Quality stop criterion": "0.000007",
    ...         "Signal/Noise ratio [ch:0]": "99",
    ...     },
    ... }
    """
    # In case `fname` doesn't end with the common suffix for job summary files check if
    # it is the actual *image* filename of an HRM job and try to use the corresponding
    # parameter summary file instead:
    suffix = ".parameters.txt"
    if not fname.endswith(suffix):
        candidate = parse_job_basename(fname) + suffix
        if os.path.exists(candidate):
            log.debug(f"Found [{candidate}], will use it instead of [{fname}].")
            fname = candidate

    log.debug(f"Trying to parse job parameter summary file [{fname}]...")

    try:
        with open(fname, "r", encoding="utf-8") as soupfile:
            soup = BeautifulSoup(soupfile, features="html.parser")
            log.trace(f"BeautifulSoup successfully parsed [{fname}].")
    except IOError as err:
        log.error(f"Unable to open parameter summary file [{fname}]: {err}")
        return None
    except Exception as err:  # pragma: no cover  # pylint: disable-msg=broad-except
        log.error(f"Parsing summary file [{fname}] failed: {err}")
        return None

    sections = {}  # job parameter summaries have multiple sections split by headers
    rows_total = 0  # number of <tr> items in all tables that were actually parsed
    for table in soup.find_all("table"):
        log.trace("Parsing table header...")
        try:
            rows = table.find_all("tr")
            header = rows[0].find_all("td", class_="header")[0].text
        except IndexError:
            # either the table has no rows or its first row has no header cell
            log.debug("Skipping table entry that doesn't have a header.")
            continue
        log.trace(f"Parsed table header: {header}")
        if header in sections:
            raise KeyError(f"Error parsing parameters, duplicate header: {header}")

        pairs = {}
        # and the table body, starting from the 3rd <tr> item:
        for row in rows[2:]:
            cols = row.find_all("td")
            # parse the parameter "name":
            param_key = cols[0].text
            log.trace(f"Parsed (raw) key name: {param_key}")
            # replace HTML-encoded chars (only has an effect if the entity is still
            # present in its encoded form in the text returned by the HTML parser):
            param_key = param_key.replace("&mu;m", "µm")

            # parse the channel and add it to the key-string (unless it's "All"):
            channel = cols[1].text
            if channel == "All":
                channel = ""
            else:
                channel = f" [ch:{channel}]"
            param_key += channel

            # parse the parameter value (the 3rd column, "source", is ignored):
            param_value = cols[3].text

            # finally add a new entry to the dict unless the key already exists:
            if param_key in pairs:
                raise KeyError(f"Parsing failed, duplicate parameter: {param_key}")
            pairs[param_key] = param_value
        sections[header] = pairs
        rows_total += len(rows)

    log.success(f"Processed {rows_total} table rows.")
    return sections

def parse_config(filename): View Source

def parse_config(filename):
    """Assemble a dict from the HRM config file (shell syntax).

    Usually, the config is located at /etc/hrm.conf and written in shell syntax as
    this file simply gets sourced by the bash init script and other shell based
    tools.

    The file is tokenized using `shlex` and is expected to consist of entries of
    the form `KEY="some-value"`, optionally separated by semicolons.

    Parameters
    ----------
    filename : str
        The name of the configuration file to be parsed.

    Returns
    -------
    dict
        A dict with the parsed configuration items.

    Raises
    ------
    SyntaxError
        Raised in case the given configuration file can't be parsed correctly,
        i.e. if a key is not followed by an equals sign (`=`).

    Example
    -------
    >>> parse_config('/etc/hrm.conf')
    ... {
    ...     'HRM_DATA': '/export/hrm_data',
    ...     'HRM_DEST': 'dst',
    ...     'HRM_HOME': '/var/www/hrm',
    ...     'HRM_LOG': '/var/log/hrm',
    ...     'HRM_SOURCE': 'src',
    ...     'OMERO_HOSTNAME': 'omero.mynetwork.xy',
    ...     'OMERO_PKG': '/opt/OMERO/OMERO.server',
    ...     'OMERO_PORT': '4064',
    ...     'PHP_CLI': '/usr/local/php/bin/php',
    ...     'SUSER': 'hrm'
    ... }
    """
    log.debug(f"Trying to parse HRM configuration file [{filename}]...")
    config = {}
    with open(filename, "r", encoding="utf-8") as file:
        body = file.read()

    lexer = shlex.shlex(body)
    # allow dashes, dots and slashes to be part of a single (unquoted) token:
    lexer.wordchars += "-./"
    while True:
        token = lexer.get_token()
        # the lexer signals the end of the input with an empty token:
        if not token:
            break
        # it's valid sh syntax to use a semicolon to join lines, so accept it:
        if token == ";":
            continue
        # we assume entries of the following form:
        # KEY="some-value"
        key = token
        try:
            equals = lexer.get_token()
        except Exception as err:  # pragma: no cover # pylint: disable-msg=broad-except
            # best-effort: report lexer problems but keep going
            log.warning(f"Error parsing config: {err}")
        else:
            # explicit check instead of an `assert`, as assertions are stripped
            # when Python runs with `-O` (which would silently accept bad syntax):
            if equals != "=":
                raise SyntaxError(
                    f"Can't parse {filename}, invalid syntax in line {lexer.lineno} "
                    f"(expected '=', found '{equals}')."
                )
        value = lexer.get_token()
        value = value.replace('"', "")  # remove double quotes
        value = value.replace("'", "")  # remove single quotes
        config[key] = value
    log.info("Successfully parsed [{}].", filename)
    return config

Assemble a dict from the HRM config file (shell syntax).

Usually, the config is located at /etc/hrm.conf and written in shell syntax as this file simply gets sourced by the bash init script and other shell based tools.

Parameters

filename (str): The name of the configuration file to be parsed.

Returns

dict: A dict with the parsed configuration items.

Raises

SyntaxError: Raised in case the given configuration file can't be parsed correctly.

Example

>>> parse_config('/etc/hrm.conf')
... {
...     'HRM_DATA': '/export/hrm_data',
...     'HRM_DEST': 'dst',
...     'HRM_HOME': '/var/www/hrm',
...     'HRM_LOG': '/var/log/hrm',
...     'HRM_SOURCE': 'src',
...     'OMERO_HOSTNAME': 'omero.mynetwork.xy',
...     'OMERO_PKG': '/opt/OMERO/OMERO.server',
...     'OMERO_PORT': '4064',
...     'PHP_CLI': '/usr/local/php/bin/php',
...     'SUSER': 'hrm'
... }

def check_config(config): View Source

 85def check_config(config):
 86    """Check the config dict for required entries.
 87
 88    Parameters
 89    ----------
 90    config : dict
 91        A dict with a parsed configuration, as returned by `parse_hrm_conf()`.
 92
 93    Raises
 94    ------
 95    SyntaxError
 96        Raised in case one of the required configuration items is missing.
 97    """
 98    required = ["OMERO_PKG", "OMERO_HOSTNAME"]
 99    for entry in required:
100        if entry not in config:
101            raise SyntaxError(f"Missing '{entry}'' in the HRM config file.")
102    log.debug("HRM config file passed all checks.")

Check the config dict for required entries.

Parameters

config (dict): A dict with a parsed configuration, as returned by parse_config().

Raises

SyntaxError: Raised in case one of the required configuration items is missing.

def job_parameter_summary(fname): View Source

def job_parameter_summary(fname):
    """Generate a parameter summary text from the HRM-generated HTML file.

    Call the parser for the HTML file generated by the HRM containing the parameter
    summary and generate a plain-text version from the parsed results.

    Parameters
    ----------
    fname : str
        The filename of the job's HTML parameter summary.

    Returns
    -------
    str or None
        The formatted string containing the parameter summary, or `None` in case
        `parse_summary()` returned `None`.
    """
    parsed = parse_summary(fname)
    if parsed is None:
        return None

    # collect all fragments first and join them once at the end:
    fragments = []
    for section, parameters in parsed.items():
        fragments.append(f"{section}\n==============================\n")
        fragments.extend(f"{name}: {value}\n" for name, value in parameters.items())
    summary = "".join(fragments)

    log.debug(f"Job parameter summary:\n---\n{summary}---")
    log.success("Generated parameter summary.")
    return summary

Generate a parameter summary text from the HRM-generated HTML file.

Call the parser for the HTML file generated by the HRM containing the parameter summary and generate a plain-text version from the parsed results.

Parameters

fname (str): The filename of the job's HTML parameter summary.

Returns

str: The formatted string containing the parameter summary.

def parse_job_basename(fname): View Source

def parse_job_basename(fname):
    """Strip the file suffix following an HRM job label from a file name.

    HRM job IDs are generated via PHP's `uniqid()` call, which returns a 13-digit
    hexadecimal string (8 digits UNIX time and 5 digits microseconds). The HRM labels
    its result files by appending an underscore (`_`) followed by this ID and an
    `_hrm` suffix. This function tries to match this section and, if it is followed
    by a dot, removes the dot and everything *after* it from the name.

    The intention is to safely remove the suffix from an image file name while making
    no assumptions about what the suffix looks like (could e.g. be `.ics`, `.ome.tif`
    or similar).

    Parameters
    ----------
    fname : str
        The input string, usually the name of an HRM result file (but any string is
        accepted).

    Returns
    -------
    str
        The input string (`fname`) where everything *after* an HRM-like job label (e.g.
        `_abcdef0123456_hrm` or `_f435a27b9c85e_hrm`) is removed. In case the input
        string does *not* contain a matching section it is returned unchanged.
    """
    log.trace(fname)
    # job label (kept as group 1) followed by a dot and the remaining suffix:
    label_and_suffix = re.compile(r"(_[0-9a-f]{13}_hrm)\..*")
    stripped = label_and_suffix.sub(r"\1", fname)
    log.trace(stripped)
    return stripped

Parse the basename from an HRM job result file name.

HRM job IDs are generated via PHP's uniqid() call, which returns a 13-digit hexadecimal string (8 digits UNIX time and 5 digits microseconds). The HRM labels its result files by appending an underscore (_) followed by this ID and an _hrm suffix. This function tries to match this section and remove everything after it from the name.

Its intention is to safely remove the suffix from an image file name while making no assumptions about what the suffix looks like (could e.g. be .ics, .ome.tif or similar).

Parameters

fname (str): The input string, usually the name of an HRM result file (but any string is accepted).

Returns

str: The input string (fname) where everything after an HRM-like job label (e.g. _abcdef0123456_hrm or _f435a27b9c85e_hrm) is removed. In case the input string does not contain a matching section it is returned unchanged.

def parse_summary(fname): View Source

def parse_summary(fname):
    """Parse the job parameter summary generated by HRM into a dict.

    Parse the HTML file generated by the HRM containing the parameter summary and
    generate a nested dict from it. The HTML file is assumed to contain three `<table>`
    items that contain a single `<td class="header">` item with the title and a `<tr>`
    section with four `<td>` items per parameter (being *parameter-name*, *channel*,
    *source* and *value*), e.g. something of this form:

    ```
    _____________________________________________
    |___________________title___________________|
    |_________________(ignored)_________________|
    | parameter-name | channel | source | value |
    ...
    | parameter-name | channel | source | value |
    ---------------------------------------------
    ```

    Tables that don't have a header cell in their first row are skipped.

    Parameters
    ----------
    fname : str
        The filename of the job's HTML parameter summary or (e.g.) the resulting image
        file. In case `fname` doesn't end in the common parameter summary suffix (for
        example if the image file name was provided), the function tries to derive the
        name of summary file and use that one for parsing.

    Returns
    -------
    dict(dict) or None
        A dict with the parsed section names (table titles) being the keys, each
        containing another dict with the parameter names as keys (including the channel
        unless the parameter is channel-independent). See the example below. In case
        the file can't be opened or parsed, `None` is returned.

    Raises
    ------
    KeyError
        Raised in case a table header or a parameter name (within one table) is found
        more than once.
    IndexError
        Raised in case a parameter row has fewer than four `<td>` items.

    Example
    -------
    >>> parse_summary('image_001.parameters.txt')
    ... {
    ...     "Image Parameters": {
    ...         "Emission wavelength (nm) [ch:0]": "567.000",
    ...         "Excitation wavelength (nm) [ch:0]": "456.000",
    ...         "Lens refractive index [ch:0]": "4.567",
    ...         "Microscope type [ch:0]": "widefield",
    ...         "Numerical aperture [ch:0]": "2.345",
    ...         "Point Spread Function": "theoretical",
    ...         "Sample refractive index [ch:0]": "3.456",
    ...         "Time interval (s)": "1.000000",
    ...         "X pixel size (μm)": "0.123456",
    ...         "Y pixel size (μm)": "0.123456",
    ...         "Z step size (μm)": "0.234567",
    ...     },
    ...     "Restoration Parameters": {
    ...         "Autocrop": "no",
    ...         "Background estimation": "auto",
    ...         "Deconvolution algorithm": "iiff",
    ...         "Number of iterations": "42",
    ...         "Quality stop criterion": "0.000007",
    ...         "Signal/Noise ratio [ch:0]": "99",
    ...     },
    ... }
    """
    # In case `fname` doesn't end with the common suffix for job summary files check if
    # it is the actual *image* filename of an HRM job and try to use the corresponding
    # parameter summary file instead:
    suffix = ".parameters.txt"
    if not fname.endswith(suffix):
        candidate = parse_job_basename(fname) + suffix
        if os.path.exists(candidate):
            log.debug(f"Found [{candidate}], will use it instead of [{fname}].")
            fname = candidate

    log.debug(f"Trying to parse job parameter summary file [{fname}]...")

    try:
        with open(fname, "r", encoding="utf-8") as soupfile:
            soup = BeautifulSoup(soupfile, features="html.parser")
            log.trace(f"BeautifulSoup successfully parsed [{fname}].")
    except IOError as err:
        log.error(f"Unable to open parameter summary file [{fname}]: {err}")
        return None
    except Exception as err:  # pragma: no cover  # pylint: disable-msg=broad-except
        log.error(f"Parsing summary file [{fname}] failed: {err}")
        return None

    sections = {}  # job parameter summaries have multiple sections split by headers
    rows_total = 0  # number of <tr> items in all tables that were actually parsed
    for table in soup.find_all("table"):
        log.trace("Parsing table header...")
        try:
            rows = table.find_all("tr")
            header = rows[0].find_all("td", class_="header")[0].text
        except IndexError:
            # either the table has no rows or its first row has no header cell
            log.debug("Skipping table entry that doesn't have a header.")
            continue
        log.trace(f"Parsed table header: {header}")
        if header in sections:
            raise KeyError(f"Error parsing parameters, duplicate header: {header}")

        pairs = {}
        # and the table body, starting from the 3rd <tr> item:
        for row in rows[2:]:
            cols = row.find_all("td")
            # parse the parameter "name":
            param_key = cols[0].text
            log.trace(f"Parsed (raw) key name: {param_key}")
            # replace HTML-encoded chars (only has an effect if the entity is still
            # present in its encoded form in the text returned by the HTML parser):
            param_key = param_key.replace("&mu;m", "µm")

            # parse the channel and add it to the key-string (unless it's "All"):
            channel = cols[1].text
            if channel == "All":
                channel = ""
            else:
                channel = f" [ch:{channel}]"
            param_key += channel

            # parse the parameter value (the 3rd column, "source", is ignored):
            param_value = cols[3].text

            # finally add a new entry to the dict unless the key already exists:
            if param_key in pairs:
                raise KeyError(f"Parsing failed, duplicate parameter: {param_key}")
            pairs[param_key] = param_value
        sections[header] = pairs
        rows_total += len(rows)

    log.success(f"Processed {rows_total} table rows.")
    return sections

Parse the job parameter summary generated by HRM into a dict.

Parse the HTML file generated by the HRM containing the parameter summary and generate a nested dict from it. The HTML file is assumed to contain three <table> items that contain a single <td class="header"> item with the title and a <tr> section with four <td> items per parameter (being parameter-name, channel, source and value), e.g. something of this form:

_____________________________________________
|___________________title___________________|
|_________________(ignored)_________________|
| parameter-name | channel | source | value |
...
| parameter-name | channel | source | value |
---------------------------------------------

Parameters

fname (str): The filename of the job's HTML parameter summary or (e.g.) the resulting image file. In case fname doesn't end in the common parameter summary suffix (for example if the image file name was provided), the function tries to derive the name of summary file and use that one for parsing.

Returns

dict(dict): A dict with the parsed section names (table titles) being the keys, each containing another dict with the parameter names as keys (including the channel unless the parameter is channel-independent). See the example below.

Example

>>> parse_summary('image_001.parameters.txt')
... {
...     "Image Parameters": {
...         "Emission wavelength (nm) [ch:0]": "567.000",
...         "Excitation wavelength (nm) [ch:0]": "456.000",
...         "Lens refractive index [ch:0]": "4.567",
...         "Microscope type [ch:0]": "widefield",
...         "Numerical aperture [ch:0]": "2.345",
...         "Point Spread Function": "theoretical",
...         "Sample refractive index [ch:0]": "3.456",
...         "Time interval (s)": "1.000000",
...         "X pixel size (μm)": "0.123456",
...         "Y pixel size (μm)": "0.123456",
...         "Z step size (μm)": "0.234567",
...     },
...     "Restoration Parameters": {
...         "Autocrop": "no",
...         "Background estimation": "auto",
...         "Deconvolution algorithm": "iiff",
...         "Number of iterations": "42",
...         "Quality stop criterion": "0.000007",
...         "Signal/Noise ratio [ch:0]": "99",
...     },
... }