Source code for standard_names.cmd.snscrape

#! /usr/bin/env python
"""
Example usage:
    snscrape http://csdms.colorado.edu/wiki/CSN_Quantity_Templates \
             http://csdms.colorado.edu/wiki/CSN_Object_Templates \
             http://csdms.colorado.edu/wiki/CSN_Operation_Templates \
            > data/scraped.yaml
"""
from __future__ import print_function

import os

from ..utilities import FORMATTERS, SCRAPERS, scrape


_AS_TXT = FORMATTERS["txt"]

_DEFAULT_SEARCH = r"\b[\w~-]+__[\w~-]+"
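# The default pattern treats a standard name, roughly, as two word-like
# tokens (letters, digits, underscores, "~", or "-") joined by a double
# underscore. A minimal, illustrative sketch of what it matches (the sample
# text below is made up, not taken from the CSN wiki):
#
#     >>> import re
#     >>> re.findall(_DEFAULT_SEARCH, "air__temperature and sea_water__salinity")
#     ['air__temperature', 'sea_water__salinity']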


def snscrape(files, with_headers=False, regex=None, format="url", newline=None):
    """Scrape names from files or URLs.

    Parameters
    ----------
    files : iterable of str
        List of files or URLs to scrape.
    with_headers : bool, optional
        Include headers in the output that indicate the name of the source.
    regex : str, optional
        A regular expression that defines what a Standard Name is.
    format : {'url', 'plain_text'}, optional
        The format of the target that's being scraped.
    newline : str, optional
        Newline character to use for output.

    Returns
    -------
    str
        The scraped names.

    Examples
    --------
    >>> from __future__ import print_function
    >>> from six.moves import StringIO
    >>> import standard_names as csn

    >>> file1 = StringIO(\"\"\"
    ... A file is one name, which is air__temperature.
    ... \"\"\")
    >>> file2 = StringIO(\"\"\"
    ... A file is two names: air__temperature, and water__temperature.
    ... \"\"\")

    >>> lines = csn.cmd.snscrape.snscrape([file1, file2], format='plain_text')
    >>> sorted(lines.split(os.linesep))
    ['air__temperature', 'air__temperature', 'water__temperature']
    """
    newline = newline or os.linesep
    regex = regex or _DEFAULT_SEARCH

    docs = {}
    for file_name in files:
        docs[file_name] = scrape(file_name, regex=regex, format=format)

    documents = []
    for (name, name_list) in docs.items():
        if with_headers:
            heading = "Scraped from %s" % name
        else:
            heading = None
        documents.append(_AS_TXT(name_list, sorted=True, heading=heading))

    return newline.join(documents)
def main(args=None):
    """Scrape standard names from a file or URL.

    Examples
    --------
    >>> import os
    >>> import tempfile
    >>> import standard_names as csn

    >>> contents = \"\"\"
    ... A file with text and names (air__temperature) mixed in. Some names
    ... have double underscores (like Water__Temperature) but are not
    ... valid names. Others, like water__temperature, are good.
    ... \"\"\"

    >>> (fd, fname) = tempfile.mkstemp()
    >>> os.close(fd)
    >>> with open(fname, 'w') as fp:
    ...     print(contents, file=fp)

    >>> names = csn.cmd.snscrape.main(
    ...     [fp.name, '--reader=plain_text', '--no-headers'])
    >>> names.split(os.linesep)
    ['air__temperature', 'water__temperature']

    >>> os.remove(fname)
    """
    import argparse

    parser = argparse.ArgumentParser("Scrape standard names from a file or URL")
    parser.add_argument("file", nargs="+", metavar="FILE", help="URL or file to scrape")
    parser.add_argument(
        "--reader", choices=SCRAPERS.keys(), default="url", help="Name of reader"
    )
    parser.add_argument(
        "--regex",
        default=_DEFAULT_SEARCH,
        help="Regular expression describing "
        "a standard name (%s)" % _DEFAULT_SEARCH,
    )
    parser.add_argument(
        "--no-headers", action="store_true", help="Do not print headers between scrapes"
    )

    if args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(args)

    return snscrape(
        args.file,
        with_headers=not args.no_headers,
        regex=args.regex,
        format=args.reader,
    )
def run():
    print(main())
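# ``run`` is presumably the console-script entry point behind the ``snscrape``
# command shown in the module docstring; calling ``main`` with an argument list
# is the programmatic equivalent. A hedged sketch (the file name below is
# hypothetical):
#
#     >>> from standard_names.cmd.snscrape import main
#     >>> print(main(['notes.txt', '--reader=plain_text', '--no-headers']))  # doctest: +SKIP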