swift-foundation/Sources/FoundationEssentials/String/String+Internals.swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2023 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#if FOUNDATION_FRAMEWORK
@_spi(_Unicode) import Swift
internal import Foundation_Private.NSString
#endif

#if canImport(Darwin)
import Darwin
#endif

#if os(Windows)
import WinSDK

extension String {
    /// Invokes `body` with a resolved and potentially `\\?\`-prefixed version of the pointee,
    /// to ensure long paths greater than MAX_PATH (260) characters are handled correctly.
    ///
    /// - parameter relative: Returns the original path without transforming through GetFullPathNameW + PathCchCanonicalizeEx, if the path is relative.
    /// - seealso: https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation
    package func withNTPathRepresentation<Result>(relative: Bool = false, _ body: (UnsafePointer<WCHAR>) throws -> Result) throws -> Result {
        guard !isEmpty else {
            throw CocoaError.errorWithFilePath(.fileReadInvalidFileName, "")
        }

        var iter = self.utf8.makeIterator()
        let bLeadingSlash = if [._slash, ._backslash].contains(iter.next()), iter.next()?.isLetter ?? false, iter.next() == ._colon { true } else { false }

        // Strip the leading `/` on a RFC8089 path (`/[drive-letter]:/...` ).  A
        // leading slash indicates a rooted path on the drive for the current
        // working directory.
        return try Substring(self.utf8.dropFirst(bLeadingSlash ? 1 : 0)).withCString(encodedAs: UTF16.self) { pwszPath in
            if relative && PathIsRelativeW(pwszPath) {
                return try body(pwszPath)
            }

            // 1. Normalize the path first.
            // Contrary to the documentation, this works on long paths independently
            // of the registry or process setting to enable long paths (but it will also
            // not add the \\?\ prefix required by other functions under these conditions).
            let dwLength: DWORD = GetFullPathNameW(pwszPath, 0, nil, nil)
            return try withUnsafeTemporaryAllocation(of: WCHAR.self, capacity: Int(dwLength)) { pwszFullPath in
                guard (1..<dwLength).contains(GetFullPathNameW(pwszPath, DWORD(pwszFullPath.count), pwszFullPath.baseAddress, nil)) else {
                    throw CocoaError.errorWithFilePath(self, win32: GetLastError(), reading: true)
                }

                // 1.5 Leave \\.\ prefixed paths alone since device paths are already an exact representation and PathCchCanonicalizeEx will mangle these.
                if let base = pwszFullPath.baseAddress,
                    base[0] == UInt16(UInt8._backslash),
                    base[1] == UInt16(UInt8._backslash),
                    base[2] == UInt16(UInt8._period),
                    base[3] == UInt16(UInt8._backslash) {
                    return try body(base)
                }

                // 2. Canonicalize the path.
                // This will add the \\?\ prefix if needed based on the path's length.
                var pwszCanonicalPath: LPWSTR?
                let flags: ULONG = PATHCCH_ALLOW_LONG_PATHS
                let result = PathAllocCanonicalize(pwszFullPath.baseAddress, flags, &pwszCanonicalPath)
                if let pwszCanonicalPath {
                    defer { LocalFree(pwszCanonicalPath) }
                    if result == S_OK {
                        // 3. Perform the operation on the normalized path.
                        return try body(pwszCanonicalPath)
                    }
                }
                throw CocoaError.errorWithFilePath(self, win32: WIN32_FROM_HRESULT(result), reading: true)
            }
        }
    }
}
#endif

extension String {
    package func _trimmingWhitespace() -> String {
        if self.isEmpty {
            return ""
        }

        return String(unicodeScalars._trimmingCharacters {
            $0.properties.isWhitespace
        })
    }

    package init?(_utf16 input: UnsafeBufferPointer<UInt16>) {
        // Allocate input.count * 3 code points since one UTF16 code point may require up to three UTF8 code points when transcoded
        let str = withUnsafeTemporaryAllocation(of: UTF8.CodeUnit.self, capacity: input.count * 3) { contents in
            var count = 0
            let error = transcode(input.makeIterator(), from: UTF16.self, to: UTF8.self, stoppingOnError: true) { codeUnit in
                contents[count] = codeUnit
                count += 1
            }

            guard !error else {
                return nil as String?
            }

            return String._tryFromUTF8(UnsafeBufferPointer(rebasing: contents[..<count]))
        }

        guard let str else {
            return nil
        }
        self = str
    }

    package init?(_utf16 input: UnsafeMutableBufferPointer<UInt16>, count: Int) {
        guard let str = String(_utf16: UnsafeBufferPointer(rebasing: input[..<count])) else {
            return nil
        }
        self = str
    }

    package init?(_utf16 input: UnsafePointer<UInt16>, count: Int) {
        guard let str = String(_utf16: UnsafeBufferPointer(start: input, count: count)) else {
            return nil
        }
        self = str
    }

    enum _NormalizationType {
        case canonical
        case hfsPlus

        fileprivate var setType: BuiltInUnicodeScalarSet.SetType {
            switch self {
            case .canonical: .canonicalDecomposable
            case .hfsPlus: .hfsPlusDecomposable
            }
        }
    }

    private func _decomposed(_ type: String._NormalizationType, into buffer: UnsafeMutableBufferPointer<UInt8>, nullTerminated: Bool = false) -> Int? {
        var copy = self
        return copy.withUTF8 {
            try? $0._decomposed(type, as: Unicode.UTF8.self, into: buffer, nullTerminated: nullTerminated)
        }
    }

    #if canImport(Darwin) || FOUNDATION_FRAMEWORK
    fileprivate func _fileSystemRepresentation(into buffer: UnsafeMutableBufferPointer<CChar>) -> Bool {
        let result = buffer.withMemoryRebound(to: UInt8.self) { rebound in
            _decomposed(.hfsPlus, into: rebound, nullTerminated: true)
        }
        return result != nil
    }

    private var maxFileSystemRepresentationSize: Int {
        // The Darwin file system representation expands the UTF-8 contents to decomposed UTF-8 contents (only decomposing specific scalars)
        // For any given scalar that we decompose, we will increase its UTF-8 length by at most a factor of 3 during decomposition
        // (ex. U+0390 expands from 2 to 6 UTF-8 code-units, U+1D160 expands from 4 to 12 UTF-8 code-units)
        // Therefore in the worst case scenario, the result will be the UTF-8 length multiplied by a factor of 3 plus an additional byte for the null byte
        self.utf8.count * 3 + 1
    }
    #endif

    package func withFileSystemRepresentation<R>(_ block: (UnsafePointer<CChar>?) throws -> R) rethrows -> R {
        #if canImport(Darwin) || FOUNDATION_FRAMEWORK
        try withUnsafeTemporaryAllocation(of: CChar.self, capacity: maxFileSystemRepresentationSize) { buffer in
            guard _fileSystemRepresentation(into: buffer) else {
                return try block(nil)
            }
            return try block(buffer.baseAddress!)
        }
        #else
#if os(Windows)
        var iter = self.utf8.makeIterator()
        let bLeadingSlash = if iter.next() == ._slash, iter.next()?.isLetter ?? false, iter.next() == ._colon { true } else { false }
        // Strip the leading `/` on a RFC8089 path (`/[drive-letter]:/...` ).  A
        // leading slash indicates a rooted path on the drive for the current
        // working directory.
        return try Substring(self.utf8.dropFirst(bLeadingSlash ? 1 : 0)).replacing(._slash, with: ._backslash).withCString {
            try block($0)
        }
#else
        return try withCString {
            try block($0)
        }
#endif
        #endif
    }

    package func withMutableFileSystemRepresentation<R>(_ block: (UnsafeMutablePointer<CChar>?) throws -> R) rethrows -> R {
        #if canImport(Darwin) || FOUNDATION_FRAMEWORK
        try withUnsafeTemporaryAllocation(of: CChar.self, capacity: maxFileSystemRepresentationSize) { buffer in
            guard _fileSystemRepresentation(into: buffer) else {
                return try block(nil)
            }
            return try block(buffer.baseAddress!)
        }
        #else
#if os(Windows)
        var iter = self.utf8.makeIterator()
        let bLeadingSlash = if iter.next() == ._slash, iter.next()?.isLetter ?? false, iter.next() == ._colon { true } else { false }
        var mut: String =
            Substring(self.utf8[self.utf8.index(self.utf8.startIndex, offsetBy: bLeadingSlash ? 1 : 0)...])
                .replacing(._slash, with: ._backslash)
#else
        var mut: String = self
#endif

        return try mut.withUTF8 { utf8Buffer in
            // Leave space for a null byte at the end
            try withUnsafeTemporaryAllocation(of: CChar.self, capacity: utf8Buffer.count + 1) { temporaryBuffer in
                try utf8Buffer.withMemoryRebound(to: CChar.self) { utf8CCharBuffer in
                    let nullByteIndex = temporaryBuffer.initialize(fromContentsOf: utf8CCharBuffer)
                    // Null-terminate
                    temporaryBuffer.initializeElement(at: nullByteIndex, to: CChar(0))
                    let result = try block(temporaryBuffer.baseAddress)
                    temporaryBuffer.prefix(through: nullByteIndex).deinitialize()
                    return result
                }
            }
        }
        #endif
    }

}

extension UnsafeBufferPointer {
    private enum DecompositionError : Error {
        case insufficientSpace
        case illegalScalar
        case decodingError
    }

    fileprivate func _decomposedRebinding<T: UnicodeCodec, InputElement>(_ type: String._NormalizationType, as codec: T.Type, into buffer: UnsafeMutableBufferPointer<InputElement>, nullTerminated: Bool = false) throws -> Int {
        try self.withMemoryRebound(to: T.CodeUnit.self) { reboundSelf in
            try buffer.withMemoryRebound(to: Unicode.UTF8.CodeUnit.self) { reboundBuffer in
                try reboundSelf._decomposed(type, as: codec, into: reboundBuffer, nullTerminated: nullTerminated)
            }
        }
    }

    fileprivate func _decomposed<T: UnicodeCodec>(_ type: String._NormalizationType, as codec: T.Type, into buffer: UnsafeMutableBufferPointer<UInt8>, nullTerminated: Bool = false) throws -> Int where Element == T.CodeUnit {
        let scalarSet = BuiltInUnicodeScalarSet(type: type.setType)
        var bufferIdx = 0
        let bufferLength = buffer.count
        var sortBuffer: [UnicodeScalar] = []
        var seenNullIdx: Int? = nil
        var decoder = T()
        var iterator = self.makeIterator()

        guard !buffer.isEmpty else {
            if !nullTerminated && iterator.next() == nil {
                // No bytes to write, so an empty buffer is OK
                return 0
            } else {
                throw DecompositionError.insufficientSpace
            }
        }

        defer {
            if nullTerminated {
                // Ensure buffer is always null-terminated even on failure to prevent buffer over-reads
                // At this point, the buffer is known to be non-empty, so it must have space for at least a null terminating byte (even if it overwrites the final output byte in the buffer)
                if bufferIdx < bufferLength {
                    // We still have space left in the buffer - if we haven't already null-terminated then add a null byte to the buffer
                    // Since we have space, we only want to write the null byte when/where we have to since some clients provide buffer sizes that don't match the true buffer length
                    if bufferIdx == buffer.startIndex || buffer[bufferIdx - 1] != 0 {
                        buffer[bufferIdx] = 0
                    }
                } else {
                    // The buffer is non-empty but we've completely filled it, overwrite the last written byte with a null byte to ensure null termination
                    buffer[buffer.count - 1] = 0
                }
            }
        }

        func appendOutput(_ values: some Collection<UInt8>) throws {
            let bufferPortion = UnsafeMutableBufferPointer(start: buffer.baseAddress!.advanced(by: bufferIdx), count: bufferLength - bufferIdx)
            guard bufferPortion.count >= values.count else {
                throw DecompositionError.insufficientSpace
            }
            bufferIdx += bufferPortion.initialize(fromContentsOf: values)
        }

        func appendOutput(_ value: UInt8) throws {
            guard bufferIdx < bufferLength else {
                throw DecompositionError.insufficientSpace
            }
            buffer.initializeElement(at: bufferIdx, to: value)
            bufferIdx += 1
        }

        func encodedScalar(_ scalar: UnicodeScalar) throws -> some Collection<UInt8> {
            guard let encoded = UTF8.encode(scalar) else {
                throw DecompositionError.illegalScalar
            }
            return encoded
        }

        func fillFromSortBuffer() throws {
            guard !sortBuffer.isEmpty else { return }
            sortBuffer.sort {
                $0.properties.canonicalCombiningClass.rawValue < $1.properties.canonicalCombiningClass.rawValue
            }
            for scalar in sortBuffer {
                try appendOutput(encodedScalar(scalar))
            }
            sortBuffer.removeAll(keepingCapacity: true)
        }

        decodingLoop: while bufferIdx < bufferLength {
            var scalar: UnicodeScalar
            switch decoder.decode(&iterator) {
            // We've finished the input, return the index
            case .emptyInput: break decodingLoop
            case .error: throw DecompositionError.decodingError
            case .scalarValue(let v): scalar = v
            }

            if scalar.value == 0 {
                // Null bytes within the string are fine as long as they are at the end
                seenNullIdx = bufferIdx
            } else if seenNullIdx != nil {
                // File system representations are c-strings that do not support embedded null bytes
                throw DecompositionError.illegalScalar
            }

            let isASCII = scalar.isASCII
            if isASCII || scalar.properties.canonicalCombiningClass == .notReordered {
                try fillFromSortBuffer()
            }

            if isASCII {
                try appendOutput(UInt8(scalar.value))
            } else {
#if FOUNDATION_FRAMEWORK
                // Only decompose scalars present in the declared set
                if scalarSet.contains(scalar) {
                    sortBuffer.append(contentsOf: String(scalar)._nfd)
                } else {
                    // Even if a scalar isn't decomposed, it may still need to be re-ordered
                    sortBuffer.append(scalar)
                }
#else
                // TODO: Implement Unicode decomposition in swift-foundation
                sortBuffer.append(scalar)
#endif
            }
        }
        try fillFromSortBuffer()

        if iterator.next() != nil {
            throw DecompositionError.insufficientSpace
        } else {
            if let seenNullIdx {
                return seenNullIdx + 1
            }
            if nullTerminated {
                try appendOutput(0)
            }
            return bufferIdx
        }
    }
}

#if FOUNDATION_FRAMEWORK
@objc
extension NSString {
    @objc
    func __swiftFillFileSystemRepresentation(pointer: UnsafeMutablePointer<CChar>, maxLength: Int) -> Bool {
        autoreleasepool {
            let buffer = UnsafeMutableBufferPointer(start: pointer, count: maxLength)

            guard !buffer.isEmpty else {
                // No space for a null terminating byte, so it's not worth even trying to read the string contents
                return false
            }

            // See if we have a quick-access buffer we can just convert directly
            if let fastCharacters = self._fastCharacterContents() {
                // If we have quick access to UTF-16 contents, decompose from UTF-16
                let charsBuffer = UnsafeBufferPointer(start: fastCharacters, count: self.length)
                return (try? charsBuffer._decomposedRebinding(.hfsPlus, as: Unicode.UTF16.self, into: buffer, nullTerminated: true)) != nil
            } else if self.fastestEncoding == NSASCIIStringEncoding, let fastUTF8 = self._fastCStringContents(false) {
                // If we have quick access to ASCII contents, no need to decompose
                let utf8Buffer = UnsafeBufferPointer(start: fastUTF8, count: self.length)

                // We only allow embedded nulls if there are no non-null characters following the first null character
                if let embeddedNullIdx = utf8Buffer.firstIndex(of: 0) {
                    if !utf8Buffer[embeddedNullIdx...].allSatisfy({ $0 == 0 }) {
                        // Ensure the buffer is always null-terminated even on failure to prevent buffer over-reads - at this point we know the buffer is non-empty
                        buffer[0] = 0
                        return false
                    }
                }

                var (leftoverIterator, next) = buffer.initialize(from: utf8Buffer)
                guard leftoverIterator.next() == nil && next < buffer.endIndex else {
                    // Ensure the buffer is always null-terminated even on failure to prevent buffer over-reads
                    buffer[buffer.endIndex - 1] = 0
                    return false
                }
                buffer[next] = 0
                return true
            } else {
                // Otherwise, bridge to a String which will create a UTF-8 buffer
                return String(self)._fileSystemRepresentation(into: buffer)
            }
        }
    }
}
#endif