
"""This submodule contains code for fetching/parsing URLs."""

__author__ = "Jens Finkhaeuser"
__copyright__ = "Copyright (c) 2016-2018 Jens Finkhaeuser"
__license__ = "MIT"
__all__ = ()


from urllib import parse


class ResolutionError(LookupError):
    pass

def urlresource(url):
    """
    Return the resource part of a parsed URL.

    The resource part is defined as the part without query, parameters or
    fragment. Just the scheme, netloc and path remains.

    :param tuple url: A parsed URL
    :return: The resource part of the URL
    :rtype: str
    """
    res_list = list(url)[0:3] + [None, None, None]
    return parse.ParseResult(*res_list).geturl()

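# Illustrative sketch (not part of the original module): urlresource() keeps
# only the scheme, netloc and path of an already-parsed URL.
#
#   >>> parsed = parse.urlparse("http://example.org/spec.yaml?v=1#/definitions/Pet")
#   >>> urlresource(parsed)
#   'http://example.org/spec.yaml'
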
def absurl(url, relative_to=None):
    """
    Turn relative file URLs into absolute file URLs.

    This is necessary, because while JSON pointers do not allow relative file
    URLs, Swagger/OpenAPI explicitly does. We need to make relative paths
    absolute before passing them off to jsonschema for verification.

    Non-file URLs are left untouched. URLs without scheme are assumed to be
    file URLs.

    :param str/tuple url: The input URL.
    :param str/tuple relative_to: [optional] The URL to which the input URL
        is relative.
    :return: The output URL, parsed into components.
    :rtype: tuple
    """
    # Parse input URL, if necessary
    parsed = url
    if not isinstance(parsed, tuple):
        from .fs import is_pathname_valid

        if is_pathname_valid(url):
            from . import fs

            url = fs.to_posix(url)
        try:
            parsed = parse.urlparse(url)
        except Exception as ex:
            from .exceptions import raise_from

            raise_from(ResolutionError, ex, f"Unable to parse url: {url}")

    # Any non-file scheme we just return immediately.
    if parsed.scheme not in (None, "", "file"):
        return parsed

    # Parse up the reference URL
    reference = relative_to
    if reference and not isinstance(reference, tuple):
        from .fs import is_pathname_valid

        if is_pathname_valid(reference):
            from . import fs

            reference = fs.to_posix(reference)
        reference = parse.urlparse(reference)

    # If the input URL has no path, we assume only its fragment matters.
    # That is, we'll have to set the fragment of the reference URL to that
    # of the input URL, and return the result.
    import os.path

    from .fs import from_posix, abspath

    result_list = None
    if not parsed.path:
        if not reference or not reference.path:
            raise ResolutionError(
                "Cannot build an absolute file URL from a fragment"
                " without a reference with path!"
            )

        result_list = list(reference)
        result_list[5] = parsed.fragment
    elif os.path.isabs(from_posix(parsed.path)):
        # We have an absolute path, so we can ignore the reference entirely!
        result_list = list(parsed)
        result_list[0] = "file"  # in case it was empty
    else:
        # If we have a relative path, we require a reference.
        if not reference:
            raise ResolutionError(
                "Cannot build an absolute file URL from a relative"
                " path without a reference!"
            )
        if reference.scheme not in (None, "", "file"):
            raise ResolutionError(
                "Cannot build an absolute file URL with a non-file"
                " reference!"
            )

        result_list = list(parsed)
        result_list[0] = "file"  # in case it was empty
        result_list[2] = abspath(from_posix(parsed.path), from_posix(reference.path))

    # Reassemble the result and return it
    result = parse.ParseResult(*result_list)
    return result

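# Illustrative sketch (not part of the original module): resolving a relative
# file reference against an absolute one, and passing a non-file URL through
# untouched. Both file paths are hypothetical; results shown for a POSIX system.
#
#   >>> result = absurl("other.yaml", "/home/user/spec/openapi.yaml")
#   >>> result.scheme, result.path
#   ('file', '/home/user/spec/other.yaml')
#   >>> absurl("http://example.org/spec.yaml").geturl()
#   'http://example.org/spec.yaml'
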
def split_url_reference(base_url, reference):
    """
    Return a normalized, parsed URL and object path.

    The reference string is a JSON reference, i.e. a URL with a fragment that
    contains an object path into the referenced resource.

    The base URL is used as a reference point for relative references.

    :param mixed base_url: A parsed URL.
    :param str reference: A JSON reference string.
    :return: The parsed absolute URL of the reference and the object path.
    """
    # Parse URL
    parsed_url = absurl(reference, base_url)

    # Grab object path
    obj_path = parsed_url.fragment.split("/")
    while len(obj_path) and not obj_path[0]:
        obj_path = obj_path[1:]

    # Normalize the object path by substituting ~1 and ~0 respectively.
    def _normalize(path):
        path = path.replace("~1", "/")
        path = path.replace("~0", "~")
        return path

    obj_path = [_normalize(p) for p in obj_path]

    return parsed_url, obj_path

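# Illustrative sketch (not part of the original module): the fragment of a
# JSON reference is split into an object path, with "~1" and "~0" unescaped
# as per JSON pointer rules. The base URL below is hypothetical.
#
#   >>> base = parse.urlparse("file:///tmp/spec.yaml")
#   >>> url, path = split_url_reference(base, "#/paths/~1pets~1{id}")
#   >>> path
#   ['paths', '/pets/{id}']
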
def fetch_url_text(url, cache={}, encoding=None):
    """
    Fetch the URL.

    If the URL is a file URL, the format used for parsing depends on the file
    extension. Otherwise, YAML is assumed.

    The URL may also use the `python` scheme. In this scheme, the netloc part
    refers to an importable python package, and the path part to a path
    relative to the package path, e.g. `python://some_package/path/to/file.yaml`.

    :param tuple url: The url, parsed as returned by `absurl` above.
    :param Mapping cache: An optional cache. If the URL can be found in the
        cache, return the cache contents.
    :param str encoding: Provide an encoding for local URLs to override
        encoding detection, if desired. Defaults to None.
    :return: The resource text of the URL, and the content type.
    :rtype: tuple
    """
    url_key = "text_" + urlresource(url)
    entry = cache.get(url_key, None)
    if entry is not None:
        return entry

    # Fetch contents according to scheme. We assume requests can handle all
    # the non-file schemes, or throw otherwise.
    content = None
    content_type = None
    if url.scheme in (None, "", "file"):
        from .fs import read_file, from_posix

        try:
            content = read_file(from_posix(url.path), encoding)
        except FileNotFoundError as ex:
            from .exceptions import raise_from

            raise_from(ResolutionError, ex, f"File not found: {url.path}")
    elif url.scheme == "python":
        # Resolve package path
        package = url.netloc
        path = url.path
        if path[0] == "/":
            path = path[1:]

        import pkg_resources

        path = pkg_resources.resource_filename(package, path)

        from .fs import read_file, from_posix

        content = read_file(from_posix(path), encoding)
    else:
        import requests

        response = requests.get(url.geturl())
        if not response.ok:  # pragma: nocover
            raise ResolutionError(
                'Cannot fetch URL "%s": %d %s'
                % (url.geturl(), response.status_code, response.reason)
            )
        content_type = response.headers.get("content-type", "text/plain")
        content = response.text

    cache[url_key] = (content, content_type)
    return content, content_type

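# Illustrative sketch (not part of the original module): fetching a local file
# and a package-relative resource. The paths and "some_package" are
# hypothetical; the python:// form resolves via pkg_resources as shown above.
# The content type is only set for URLs fetched over HTTP(S).
#
#   >>> text, ctype = fetch_url_text(absurl("/tmp/spec.yaml"))
#   >>> text, ctype = fetch_url_text(absurl("python://some_package/data/spec.yaml"))
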
def fetch_url(url, cache={}, encoding=None, strict=True):
    """
    Fetch the URL and parse the contents.

    Same as fetch_url_text(), but also parses the content and only returns
    the parse results.

    :param tuple url: The url, parsed as returned by `absurl` above.
    :param Mapping cache: An optional cache. If the URL can be found in the
        cache, return the cache contents.
    :param str encoding: Provide an encoding for local URLs to override
        encoding detection, if desired. Defaults to None.
    :param bool strict: Enable strict parsing. In lenient (non-strict) mode,
        mapping keys are converted to strings. Defaults to True.
    :return: The parsed file.
    :rtype: dict
    """
    # Return from cache, if parsed result is already present.
    url_key = (urlresource(url), strict)
    entry = cache.get(url_key, None)
    if entry is not None:
        return entry.copy()

    # Fetch URL text
    content, content_type = fetch_url_text(url, cache, encoding=encoding)

    # Parse the result
    from .formats import parse_spec

    result = parse_spec(content, url.path, content_type=content_type)

    # Perform some sanitization in lenient mode.
    if not strict:
        from . import stringify_keys

        result = stringify_keys(result)

    # Cache and return result
    cache[url_key] = result
    return result.copy()

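# Illustrative sketch (not part of the original module): fetch_url() returns
# the parsed specification as a dict and stores the parse result in the cache,
# keyed by resource URL and strictness. The file path is hypothetical.
#
#   >>> cache = {}
#   >>> spec = fetch_url(absurl("/tmp/openapi.yaml"), cache)
#   >>> spec = fetch_url(absurl("/tmp/openapi.yaml"), cache)  # served from the cache
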