Source code for standard_names.cmd.snscrape

#! /usr/bin/env python
"""
Example usage:
    snscrape http://csdms.colorado.edu/wiki/CSN_Quantity_Templates \
             http://csdms.colorado.edu/wiki/CSN_Object_Templates \
             http://csdms.colorado.edu/wiki/CSN_Operation_Templates \
            > data/scraped.yaml
"""
from __future__ import print_function

import os

from ..utilities import FORMATTERS, SCRAPERS, scrape


_AS_TXT = FORMATTERS["txt"]

_DEFAULT_SEARCH = r"\b[\w~-]+__[\w~-]+"
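# The default pattern treats a standard name, roughly, as two word-like
# tokens (letters, digits, underscores, "~", or "-") joined by a double
# underscore. A minimal, illustrative sketch of what it matches (the sample
# text below is made up, not taken from the CSN wiki):
#
#     >>> import re
#     >>> re.findall(_DEFAULT_SEARCH, "air__temperature and sea_water__salinity")
#     ['air__temperature', 'sea_water__salinity']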


def snscrape(files, with_headers=False, regex=None, format="url", newline=None):
    """Scrape names from files or URLs.

    Parameters
    ----------
    files : iterable of str
        List of files or URLs to scrape.
    with_headers : bool, optional
        Include headers in the output that indicate the name of the source.
    regex : str, optional
        A regular expression that defines what a Standard Name is.
    format : {'url', 'plain_text'}, optional
        The format of the target that's being scraped.
    newline : str, optional
        Newline character to use for output.

    Returns
    -------
    str
        The scraped names.

    Examples
    --------
    >>> from __future__ import print_function
    >>> from six.moves import StringIO
    >>> import standard_names as csn

    >>> file1 = StringIO(\"\"\"
    ... A file is one name, which is air__temperature.
    ... \"\"\")
    >>> file2 = StringIO(\"\"\"
    ... A file is two names: air__temperature, and water__temperature.
    ... \"\"\")

    >>> lines = csn.cmd.snscrape.snscrape([file1, file2], format='plain_text')
    >>> sorted(lines.split(os.linesep))
    ['air__temperature', 'air__temperature', 'water__temperature']
    """
    newline = newline or os.linesep
    regex = regex or _DEFAULT_SEARCH

    docs = {}
    for file_name in files:
        docs[file_name] = scrape(file_name, regex=regex, format=format)

    documents = []
    for (name, name_list) in docs.items():
        if with_headers:
            heading = "Scraped from %s" % name
        else:
            heading = None
        documents.append(_AS_TXT(name_list, sorted=True, heading=heading))

    return newline.join(documents)
def main(args=None):
    """Scrape standard names from a file or URL.

    Examples
    --------
    >>> import os
    >>> import tempfile
    >>> import standard_names as csn

    >>> contents = \"\"\"
    ... A file with text and names (air__temperature) mixed in. Some names
    ... have double underscores (like Water__Temperature) but are not
    ... valid names. Others, like water__temperature, are good.
    ... \"\"\"

    >>> (fd, fname) = tempfile.mkstemp()
    >>> os.close(fd)
    >>> with open(fname, 'w') as fp:
    ...     print(contents, file=fp)

    >>> names = csn.cmd.snscrape.main(
    ...     [fp.name, '--reader=plain_text', '--no-headers'])
    >>> names.split(os.linesep)
    ['air__temperature', 'water__temperature']

    >>> os.remove(fname)
    """
    import argparse

    parser = argparse.ArgumentParser("Scrape standard names from a file or URL")
    parser.add_argument("file", nargs="+", metavar="FILE", help="URL or file to scrape")
    parser.add_argument(
        "--reader", choices=SCRAPERS.keys(), default="url", help="Name of reader"
    )
    parser.add_argument(
        "--regex",
        default=_DEFAULT_SEARCH,
        help="Regular expression describing "
        "a standard name (%s)" % _DEFAULT_SEARCH,
    )
    parser.add_argument(
        "--no-headers", action="store_true", help="Do not print headers between scrapes"
    )

    if args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(args)

    return snscrape(
        args.file,
        with_headers=not args.no_headers,
        regex=args.regex,
        format=args.reader,
    )
def run():
    print(main())
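# ``run`` is presumably the console-script entry point behind the ``snscrape``
# command shown in the module docstring; calling ``main`` with an argument list
# is the programmatic equivalent. A hedged sketch (the file name below is
# hypothetical):
#
#     >>> from standard_names.cmd.snscrape import main
#     >>> print(main(['notes.txt', '--reader=plain_text', '--no-headers']))  # doctest: +SKIP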