Source code for prance.util.fs

"""This submodule contains file system utilities for Prance."""

__author__ = "Jens Finkhaeuser"
__copyright__ = "Copyright (c) 2016-2019 Jens Finkhaeuser"
__license__ = "MIT"
__all__ = ()


# Re-define an error for backwards compatibility.
# The self-assignment binds the built-in exception as a module-level name, so
# code that refers to `prance.util.fs.FileNotFoundError` keeps resolving.
# NOTE(review): presumably left over from a Python 2 compatibility shim that
# aliased a different exception here -- confirm before removing.
FileNotFoundError = FileNotFoundError  # pragma: no cover


# The following constant and function are taken from
# https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta

# Sadly, Python fails to provide the following magic number for us.
# It is compared against the `winerror` attribute of `OSError` instances in
# `is_pathname_valid` to recognise an invalid path name on Windows.
_ERROR_INVALID_NAME = 123
"""
Windows-specific error code indicating an invalid pathname.

See Also
----------
https://msdn.microsoft.com/en-us/library/windows/desktop/ms681382%28v=vs.85%29.aspx
    Official listing of all such codes.
"""


# Following Microsoft documentation, set the default read size for detecting
# a file encoding to a multiple of 4k that seems to work well on various OSes
# and volume sizes.
# https://support.microsoft.com/en-us/help/140365/default-cluster-size-for-ntfs-fat-and-exfat
#
# The value is 64 KiB. `detect_encoding` reads at most this many bytes from
# the start of a file unless it is called with `read_all=True`.
_READ_CHUNK_SIZE = 64 * 1024
"""
Default read size for detecting file encoding.
"""


def is_pathname_valid(pathname):
    """
    Test whether a path name is valid.

    The check never creates anything on disk: every component of the path is
    probed with `os.lstat` directly below a directory that is known to exist,
    and only the error codes that mean "this name cannot exist on this OS"
    are treated as failures.

    :param pathname: The candidate path name; anything that is not a
        non-empty `str` is reported as invalid.
    :return: True if the passed pathname is valid on the current OS, False
        otherwise.
    :rtype: bool
    """
    import errno
    import os
    import sys

    # Only non-empty strings can possibly be path names.
    if not (isinstance(pathname, str) and pathname):
        return False

    try:
        # Drop a Windows drive specifier (e.g. `C:`), if any. Windows forbids
        # `:` inside path components, so leaving it in place would flag every
        # absolute Windows path as invalid.
        _, tail = os.path.splitdrive(pathname)

        # Pick a directory that is guaranteed to exist: the drive Windows was
        # installed to (the "%SYSTEMDRIVE%" environment variable, typically
        # `C:`) on Windows, the root directory everywhere else.
        if sys.platform == "win32":
            anchor = os.environ.get("SYSTEMDRIVE", "C:")
        else:
            anchor = os.path.sep
        assert os.path.isdir(anchor)  # ...Murphy and her ironclad Law

        # Make sure the anchor ends in exactly one path separator.
        anchor = anchor.rstrip(os.path.sep) + os.path.sep

        # POSIX reports an over-long name as ENAMETOOLONG; some edge-case
        # OSes (e.g. SunOS, *BSD) use ERANGE instead.
        invalid_errnos = {errno.ENAMETOOLONG, errno.ERANGE}

        # Probe each component on its own. Missing or unreadable components
        # are fine; only "this name is illegal" errors count.
        for component in tail.split(os.path.sep):
            try:
                os.lstat(anchor + component)
            except OSError as exc:
                # On Windows, `winerror` is finer-grained than `errno`: a
                # too-long name, for example, shows up as ENOENT in `errno`.
                # So where `winerror` exists it is the only value consulted.
                if hasattr(exc, "winerror"):  # pragma: nocover
                    if exc.winerror == _ERROR_INVALID_NAME:
                        return False
                elif exc.errno in invalid_errnos:
                    return False
    except TypeError:  # pragma: nocover
        # Almost certainly "embedded NUL character", i.e. an invalid name.
        return False
    except ValueError:
        # Null bytes may also surface as ValueError; they are invalid too.
        return False

    # Any other exception is an unrelated fatal issue (e.g. a bug) and is
    # allowed to unwind the call stack. Reaching this point means every
    # component, and hence the whole path name, is valid.
    return True
def from_posix(fname):
    """
    Convert a path from posix-like, to the platform format.

    On every platform other than Windows the name is returned unchanged. On
    Windows a single leading `/` is dropped and the remaining forward slashes
    become backslashes.

    :param str fname: The filename in posix-like format.
    :return: The filename in the format of the platform.
    :rtype: str
    """
    import sys

    if sys.platform == "win32":  # pragma: nocover
        # `startswith` rather than `fname[0]`: indexing an empty string
        # raises IndexError, whereas an empty name should pass through as-is.
        if fname.startswith("/"):
            fname = fname[1:]
        fname = fname.replace("/", "\\")
    return fname
def to_posix(fname):
    """
    Convert a path to posix-like format.

    Outside Windows this is a no-op. On Windows, an absolute path gains a
    leading `/` and every backslash is turned into a forward slash.

    :param str fname: The filename to convert to posix format.
    :return: The filename in posix-like format.
    :rtype: str
    """
    import sys

    if sys.platform != "win32":
        return fname

    # pragma: nocover -- Windows-only branch from here on.
    import os.path

    lead = "/" if os.path.isabs(fname) else ""
    return (lead + fname).replace("\\", "/")
def abspath(filename, relative_to=None):
    """
    Return the absolute path of a file relative to a reference file.

    A relative `filename` is anchored at `relative_to` when that is an
    existing directory, and at the directory containing `relative_to`
    otherwise. Without a reference (or when `filename` is already absolute)
    the name is simply passed through `canonical_filename`. The result is
    always handed back in posix-like format.

    :param str filename: The filename to make absolute.
    :param str relative_to: [optional] the reference file name.
    :return: The absolute path
    :rtype: str
    """
    import os.path

    native = from_posix(filename)

    # Only relative names are re-anchored, and only when a reference is given.
    if relative_to and not os.path.isabs(native):
        base = from_posix(relative_to)
        if not os.path.isdir(base):
            # The reference names a file: resolve against its directory.
            base = os.path.dirname(base)
        native = os.path.join(base, native)

    # Canonicalise, then return in posix-like form.
    return to_posix(canonical_filename(native))
def canonical_filename(filename):
    """
    Return the canonical version of a file name.

    The canonical version is defined as the absolute path, with the chain of
    symbolic links at that path followed until a non-link (or a non-existent
    or unreadable entry) is reached. Only the link at the path itself is
    followed; symbolic links in intermediate directories are kept as written.

    A cyclic chain of links (e.g. `a -> b -> a`) is followed only until a path
    repeats, and that path is returned instead of looping forever.

    :param str filename: The filename to make canonical.
    :return: The canonical filename.
    :rtype: str
    """
    import os
    import os.path

    path = from_posix(filename)

    # Absolute paths already visited, used to detect symlink cycles.
    visited = set()

    while True:
        path = os.path.abspath(path)
        if path in visited:
            # Already been here: the links form a cycle and can never resolve.
            return path
        visited.add(path)

        try:
            target = os.readlink(path)
        except AttributeError:  # pragma: no cover
            # os.readlink doesn't exist in windows python2.7
            return path
        except OSError:
            # Not a symbolic link, or missing/inaccessible: nothing to follow.
            return path

        # A relative link target is interpreted relative to the directory
        # holding the link; an absolute target replaces the path outright.
        path = os.path.join(os.path.dirname(path), target)
def detect_encoding(filename, default_to_utf8=True, **kwargs):
    """
    Detect the named file's character encoding.

    If the first parts of the file appear to be ASCII, this function returns
    'UTF-8', as that's a safe superset of ASCII. This can be switched off by
    changing the `default_to_utf8` parameter.

    Detection uses ICU when the `icu` module can be imported and falls back
    to `chardet` otherwise. If `chardet` cannot name an encoding at all (for
    example for an empty file), the input is treated like ASCII.

    :param str filename: The name of the file to detect the encoding of.
    :param bool default_to_utf8: Defaults to True. Set to False to disable
        treating ASCII files as UTF-8.
    :param bool read_all: Keyword argument; if True, reads the entire file
        for encoding detection.
    :return: The file encoding.
    :rtype: str
    """
    import codecs
    import os.path

    filename = from_posix(filename)

    # Read some of the file... unless we're supposed to read all of it.
    file_len = os.path.getsize(filename)
    if kwargs.get("read_all", False):
        read_len = file_len
    else:
        read_len = min(_READ_CHUNK_SIZE, file_len)

    # Read the first read_len bytes raw, so we can detect the encoding.
    with open(filename, "rb") as raw_handle:
        raw = raw_handle.read(read_len)

    # A UTF-8 byte order mark settles the question outright.
    if raw.startswith(codecs.BOM_UTF8):
        return "utf-8-sig"

    # Detect encoding using the best detector available.
    try:
        # First try ICU. ICU will report ASCII in the first 32 Bytes as
        # ISO-8859-1, which isn't exactly wrong, but maybe optimistic.
        import icu

        encoding = icu.CharsetDetector(raw).detect().getName().lower()
    except ImportError:  # pragma: nocover
        # If that doesn't work, try chardet - it's not got native components,
        # which is a bonus in some environments, but it's not as precise.
        import chardet

        # chardet reports `None` when it cannot decide (e.g. empty input);
        # treat that like ASCII instead of failing on `None.lower()`.
        detected = chardet.detect(raw)["encoding"]
        encoding = (detected or "ascii").lower()

        # Chardet is more brutal in that it reports ASCII if none of the
        # first Bytes contain high bits. To emulate ICU, we just bump up the
        # detected encoding.
        if encoding == "ascii":
            encoding = "iso-8859-1"

    # Both chardet and ICU may detect ISO-8859-x, which may not be possible
    # to decode as UTF-8. So whatever they report, we'll try decoding as
    # UTF-8 before reporting it.
    if default_to_utf8 and encoding in ("ascii", "iso-8859-1", "windows-1252"):
        # When only a prefix of the file was read, the chunk may end in the
        # middle of a multi-byte sequence. Decoding incrementally with
        # `final=False` tolerates such a dangling tail while still rejecting
        # invalid bytes anywhere else.
        whole_file_read = read_len >= file_len
        decoder = codecs.getincrementaldecoder("utf-8")()
        try:
            decoder.decode(raw, final=whole_file_read)
        except UnicodeDecodeError:
            # Decoding as utf-8 failed, so we can't default to it.
            pass
        else:
            # If this worked... well there's no guarantee it's utf-8, to be
            # honest.
            encoding = "utf-8"

    return encoding
def read_file(filename, encoding=None):
    """
    Read and decode a file, taking BOMs into account.

    :param str filename: The name of the file to read.
    :param str encoding: The encoding to use. If not given, detect_encoding
        is used to determine the encoding.
    :return: The file contents.
    :rtype: unicode string
    """
    native_name = from_posix(filename)

    # Fall back to auto-detection when the caller did not name an encoding.
    chosen = encoding or detect_encoding(native_name)

    # Finally, read the whole file in the chosen encoding.
    with open(native_name, encoding=chosen) as stream:
        text = stream.read()
    return text
def write_file(filename, contents, encoding=None):
    """
    Write a file with the given encoding.

    The default encoding is 'utf-8'. It's recommended not to change that for
    JSON or YAML output. The file is opened in text mode "w", so an existing
    file of the same name is truncated.

    :param str filename: The name of the file to write.
    :param str contents: The file contents to write.
    :param str encoding: The encoding to use. If not given, 'utf-8' is used.
    """
    target = from_posix(filename)
    with open(target, mode="w", encoding=encoding or "utf-8") as sink:
        sink.write(contents)