Source code for prance.util.fs

"""This submodule contains file system utilities for Prance."""

__author__ = "Jens Finkhaeuser"
__copyright__ = "Copyright (c) 2016-2019 Jens Finkhaeuser"
__license__ = "MIT"
__all__ = ()


# Re-define an error for backwards compatibility
FileNotFoundError = FileNotFoundError  # pragma: no cover


# The following constant and function are taken from
# https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta

# Sadly, Python fails to provide the following magic number for us.
_ERROR_INVALID_NAME = 123
"""
Windows-specific error code indicating an invalid pathname.

See Also
----------
https://msdn.microsoft.com/en-us/library/windows/desktop/ms681382%28v=vs.85%29.aspx
    Official listing of all such codes.
"""


# Following Microsoft documentation, set the default read size for detecting
# a file encoding to a multiple of 4k that seems to work well on various OSes
# and volume sizes.
# https://support.microsoft.com/en-us/help/140365/default-cluster-size-for-ntfs-fat-and-exfat
_READ_CHUNK_SIZE = 64 * 1024
"""
Default read size for detecting file encoding.
"""


[docs]def is_pathname_valid(pathname):
    """
    Test whether a path name is valid.

    :return: True if the passed pathname is valid on the current OS, False
        otherwise.
    :rtype: bool
    """
    import errno, os

    # If this pathname is either not a string or is but is empty, this pathname
    # is invalid.
    try:
        if not isinstance(pathname, str) or not pathname:
            return False

        # Strip this pathname's Windows-specific drive specifier (e.g., `C:\`)
        # if any. Since Windows prohibits path components from containing `:`
        # characters, failing to strip this `:`-suffixed prefix would
        # erroneously invalidate all valid absolute Windows pathnames.
        _, pathname = os.path.splitdrive(pathname)

        # Directory guaranteed to exist. If the current OS is Windows, this is
        # the drive to which Windows was installed (e.g., the "%SYSTEMDRIVE%"
        # environment variable); else, the typical root directory.
        # The %systemdrive% (typically c:) is the partition with
        # the %systemroot% (typically Windows) directory.
        import sys

        root_dirname = (
            os.environ.get("SYSTEMDRIVE", "C:")
            if sys.platform == "win32"
            else os.path.sep
        )
        assert os.path.isdir(root_dirname)  # ...Murphy and her ironclad Law

        # Append a path separator to this directory if needed.
        root_dirname = root_dirname.rstrip(os.path.sep) + os.path.sep

        # Test whether each path component split from this pathname is valid or
        # not, ignoring non-existent and non-readable path components.
        for pathname_part in pathname.split(os.path.sep):
            try:
                os.lstat(root_dirname + pathname_part)
            except OSError as exc:
                # If an OS-specific exception is raised, its error code
                # indicates whether this pathname is valid or not. Unless this
                # is the case, this exception implies an ignorable kernel or
                # filesystem complaint (e.g., path not found or inaccessible).
                #
                # Only the following exceptions indicate invalid pathnames:
                #
                # * Instances of the Windows-specific "WindowsError" class
                #   defining the "winerror" attribute whose value is
                #   "_ERROR_INVALID_NAME". Under Windows, "winerror" is more
                #   fine-grained and hence useful than the generic "errno"
                #   attribute. When a too-long pathname is passed, for example,
                #   "errno" is "ENOENT" (i.e., no such file or directory) rather
                #   than "ENAMETOOLONG" (i.e., file name too long).
                # * Instances of the cross-platform "OSError" class defining the
                #   generic "errno" attribute whose value is either:
                #   * Under most POSIX-compatible OSes, "ENAMETOOLONG".
                #   * Under some edge-case OSes (e.g., SunOS, *BSD), "ERANGE".
                if hasattr(exc, "winerror"):  # pragma: nocover
                    if exc.winerror == _ERROR_INVALID_NAME:
                        return False
                elif exc.errno in {errno.ENAMETOOLONG, errno.ERANGE}:
                    return False
    # If a "TypeError" exception was raised, it almost certainly has the
    # error message "embedded NUL character" indicating an invalid pathname.
    except TypeError:  # pragma: nocover
        return False
    # Null-bytes may also cause this, and they are invalid.
    except ValueError:
        return False
    # If no exception was raised, all path components and hence this
    # pathname itself are valid. (Praise be to the curmudgeonly python.)
    else:
        return True
    # If any other exception was raised, this is an unrelated fatal issue
    # (e.g., a bug). Permit this exception to unwind the call stack.
    #
    # Did we mention this should be shipped with Python already?


[docs]def from_posix(fname):
    """
    Convert a path from posix-like, to the platform format.

    :param str fname: The filename in posix-like format.
    :return: The filename in the format of the platform.
    :rtype: str
    """
    import sys

    if sys.platform == "win32":  # pragma: nocover
        if fname[0] == "/":
            fname = fname[1:]
        fname = fname.replace("/", "\\")
    return fname


[docs]def to_posix(fname):
    """
    Convert a path to posix-like format.

    :param str fname: The filename to convert to posix format.
    :return: The filename in posix-like format.
    :rtype: str
    """
    import sys

    if sys.platform == "win32":  # pragma: nocover
        import os.path

        if os.path.isabs(fname):
            fname = "/" + fname
        fname = fname.replace("\\", "/")
    return fname


[docs]def abspath(filename, relative_to=None):
    """
    Return the absolute path of a file relative to a reference file.

    If no reference file is given, this function works identical to
    `canonical_filename`.

    :param str filename: The filename to make absolute.
    :param str relative_to: [optional] the reference file name.
    :return: The absolute path
    :rtype: str
    """
    # Create filename relative to the reference, if it exists.
    import os.path

    fname = from_posix(filename)
    if relative_to and not os.path.isabs(fname):
        relative_to = from_posix(relative_to)
        if os.path.isdir(relative_to):
            fname = os.path.join(relative_to, fname)
        else:
            fname = os.path.join(os.path.dirname(relative_to), fname)

    # Make the result canonical
    fname = canonical_filename(fname)
    return to_posix(fname)


[docs]def canonical_filename(filename):
    """
    Return the canonical version of a file name.

    The canonical version is defined as the absolute path, and all file system
    links dereferenced.

    :param str filename: The filename to make canonical.
    :return: The canonical filename.
    :rtype: str
    """
    import os, os.path

    path = from_posix(filename)
    while True:
        path = os.path.abspath(path)
        try:
            p = os.path.dirname(path)
            # os.readlink doesn't exist in windows python2.7
            try:
                deref_path = os.readlink(path)
            except AttributeError:  # pragma: no cover
                return path
            path = os.path.join(p, deref_path)
        except OSError:
            return path


[docs]def detect_encoding(filename, default_to_utf8=True, **kwargs):
    """
    Detect the named file's character encoding.

    If the first parts of the file appear to be ASCII, this function returns
    'UTF-8', as that's a safe superset of ASCII. This can be switched off by
    changing the `default_to_utf8` parameter.

    :param str filename: The name of the file to detect the encoding of.
    :param bool default_to_utf8: Defaults to True. Set to False to disable
        treating ASCII files as UTF-8.
    :param bool read_all: Keyword argument; if True, reads the entire file
        for encoding detection.
    :return: The file encoding.
    :rtype: str
    """
    # Read some of the file
    import os.path

    filename = from_posix(filename)
    file_len = os.path.getsize(filename)
    read_len = min(_READ_CHUNK_SIZE, file_len)

    # ... unless we're supposed to!
    if kwargs.get("read_all", False):
        read_len = file_len

    # Read the first read_len bytes raw, so we can detect the encoding
    with open(filename, "rb") as raw_handle:
        raw = raw_handle.read(read_len)

    # Detect the encoding the file specifies, if any.
    import codecs

    if raw.startswith(codecs.BOM_UTF8):
        encoding = "utf-8-sig"
    else:
        # Detect encoding using the best detector available
        try:
            # First try ICU. ICU will report ASCII in the first 32 Bytes as
            # ISO-8859-1, which isn't exactly wrong, but maybe optimistic.
            import icu

            encoding = icu.CharsetDetector(raw).detect().getName().lower()
        except ImportError:  # pragma: nocover
            # If that doesn't work, try chardet - it's not got native components,
            # which is a bonus in some environments, but it's not as precise.
            import chardet

            encoding = chardet.detect(raw)["encoding"].lower()

            # Chardet is more brutal in that it reports ASCII if none of the first
            # Bytes contain high bits. To emulate ICU, we just bump up the detected
            # encoding.
            if encoding == "ascii":
                encoding = "iso-8859-1"

        # Both chardet and ICU may detect ISO-8859-x, which may not be possible
        # to decode as UTF-8. So whatever they report, we'll try decoding as
        # UTF-8 before reporting it.
        if default_to_utf8 and encoding in ("ascii", "iso-8859-1", "windows-1252"):
            # Try decoding as utf-8
            try:
                raw.decode("utf-8")
                # If this worked... well there's no guarantee it's utf-8, to be
                # honest.
                encoding = "utf-8"
            except UnicodeDecodeError:
                # Decoding as utf-8 failed, so we can't default to it.
                pass

    return encoding


[docs]def read_file(filename, encoding=None):
    """
    Read and decode a file, taking BOMs into account.

    :param str filename: The name of the file to read.
    :param str encoding: The encoding to use. If not given, detect_encoding is
        used to determine the encoding.
    :return: The file contents.
    :rtype: unicode string
    """
    filename = from_posix(filename)
    if not encoding:
        # Detect encoding
        encoding = detect_encoding(filename)

    # Finally, read the file in the detected encoding
    with open(filename, encoding=encoding) as handle:
        return handle.read()


[docs]def write_file(filename, contents, encoding=None):
    """
    Write a file with the given encoding.

    The default encoding is 'utf-8'. It's recommended not to change that for
    JSON or YAML output.

    :param str filename: The name of the file to read.
    :param str contents: The file contents to write.
    :param str encoding: The encoding to use. If not given, detect_encoding is
        used to determine the encoding.
    """
    if not encoding:
        encoding = "utf-8"

    fname = from_posix(filename)
    with open(fname, mode="w", encoding=encoding) as handle:
        handle.write(contents)
prance

Navigation

Related Topics

Source code for prance.util.fs