777 lines
26 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
package extension UTF8.CodeUnit {
    /// ASCII line feed (`0x0A`).
    static let newline: Self = 0x0A
    /// ASCII carriage return (`0x0D`).
    static let carriageReturn: Self = 0x0D

    /// The value `0...9` when this code unit is an ASCII decimal digit
    /// (`48...57`), otherwise `nil`.
    var _numericValue: Int? {
        guard (48...57).contains(self) else { return nil }
        return Int(self - 48)
    }

    // Copied from std; see comment in String.swift _uppercaseASCII() and _lowercaseASCII()

    /// This code unit with `0x20` added when it is an ASCII uppercase letter.
    ///
    /// The lookup index is `((self - 1) & 0x7F) >> 1`, so the high bit of the
    /// code unit does not take part in the lookup: a byte such as `0xC1` is
    /// shifted exactly like `0x41`. Use on ASCII code units only.
    var _lowercased: Self {
        // One bit per pair of adjacent code units; bits 32...44 cover "A"..."Z".
        let uppercaseBits: UInt64 =
            0b0000_0000_0000_0000_0001_1111_1111_1111 &<< 32
        let slot = UInt64(((self &- 1) & 0b0111_1111) &>> 1)
        let needsShift = (uppercaseBits &>> slot) & 0x1
        // `needsShift` is 0 or 1, so this adds either 0x00 or 0x20.
        return self &+ UInt8(truncatingIfNeeded: needsShift &<< 5)
    }

    /// This code unit with `0x20` subtracted when it is an ASCII lowercase letter.
    ///
    /// Uses the same table lookup as `_lowercased`; the high bit of the code
    /// unit is likewise ignored by the lookup. Use on ASCII code units only.
    var _uppercased: Self {
        // One bit per pair of adjacent code units; bits 48...60 cover "a"..."z".
        let lowercaseBits: UInt64 =
            0b0001_1111_1111_1111_0000_0000_0000_0000 &<< 32
        let slot = UInt64(((self &- 1) & 0b0111_1111) &>> 1)
        let needsShift = (lowercaseBits &>> slot) & 0x1
        // `needsShift` is 0 or 1, so this subtracts either 0x00 or 0x20.
        return self &- UInt8(truncatingIfNeeded: needsShift &<< 5)
    }
}
// MARK: - _StringCompareOptionsIterable Methods
// Internal protocols that share the implementation for iterating over BidirectionalCollections in the String family and processing their elements according to String.CompareOptions.
/// An element (code unit, scalar, or character) that the shared compare and
/// search routines can inspect and normalize.
internal protocol _StringCompareOptionsConvertible : Comparable & Equatable {
    /// The collection type returned by `_transform`.
    associatedtype IterableType: _StringCompareOptionsIterable

    /// The value this element contributes when a run of digits is read by
    /// `consumeNumbers(from:initialValue:)`, or `nil` when it is not treated
    /// as a digit.
    var intValue: Int? { get }

    /// Whether the element is skipped by `_consumeExtendCharacters(from:)`
    /// during diacritic-insensitive processing.
    var isExtendCharacter: Bool { get }

    /// Returns the normalized form of this element for the requested options.
    /// The result may contain zero, one, or several elements.
    func _transform(toHalfWidth: Bool, stripDiacritics: Bool, caseFolding: Bool) -> IterableType
}
/// A bidirectional collection of `_StringCompareOptionsConvertible` elements
/// that the shared compare and search routines can walk.
///
/// The constraints tie the collection produced by an element's `_transform`
/// to this collection's own `SubSequence`, and require the subsequence to
/// carry the same element type.
internal protocol _StringCompareOptionsIterable : BidirectionalCollection
where Element: _StringCompareOptionsConvertible,
      Element.IterableType.SubSequence == Self.SubSequence,
      Element == SubSequence.Element
{
    /// Creates an empty collection.
    init()

    /// The first element, or `nil` when the collection is empty.
    var first: Element? { get }

    /// Advances `i` past consecutive elements whose `isExtendCharacter` is true.
    func _consumeExtendCharacters(from i: inout Index)

    /// Reads a run of digit elements starting at `i`, advancing `i` past the
    /// elements consumed, and returns the accumulated value.
    func consumeNumbers(from i: inout Index, initialValue: Int) -> Int
}
extension _StringCompareOptionsIterable {
/// Reads consecutive digit elements starting at `i` and folds them into a
/// base-10 value seeded with `initialValue`.
///
/// - Parameters:
///   - i: Position to start reading from. On return it points at the first
///     element that was not consumed.
///   - initialValue: The value accumulated so far.
/// - Returns: The accumulated value. Reading stops at the first element whose
///   `intValue` is `nil`, at `endIndex`, or when appending the next digit
///   would overflow `Int`; in the overflow case that digit is left unconsumed.
func consumeNumbers(from i: inout Index, initialValue: Int) -> Int {
    var accumulated = initialValue
    while i < endIndex, let digit = self[i].intValue {
        // `accumulated * 10 + digit`, bailing out instead of trapping on overflow.
        let (scaled, scaleOverflowed) = accumulated.multipliedReportingOverflow(by: 10)
        if scaleOverflowed { break }
        let (sum, sumOverflowed) = scaled.addingReportingOverflow(digit)
        if sumOverflowed { break }
        accumulated = sum
        formIndex(after: &i)
    }
    return accumulated
}
/// Advances `i` past every consecutive element, starting at `i`, whose
/// `isExtendCharacter` is true. Stops at the first other element or at
/// `endIndex`.
func _consumeExtendCharacters(from i: inout Index) {
    while i < endIndex {
        guard self[i].isExtendCharacter else { return }
        formIndex(after: &i)
    }
}
/// Compares `self` with `other` element by element under the given options.
///
/// Elements are first compared as they are. Only when two elements differ are they
/// passed through `_transform(toHalfWidth:stripDiacritics:caseFolding:)`; the resulting
/// collections are then walked through the `norm1` / `norm2` buffers.
///
/// - Parameters:
///   - other: The collection to compare against.
///   - toHalfWidth: Forwarded to `_transform` as `toHalfWidth`.
///   - diacriticsInsensitive: Forwarded to `_transform` as `stripDiacritics`. Also lets
///     elements whose `isExtendCharacter` is true be skipped, including trailing ones.
///   - caseFold: Forwarded to `_transform` as `caseFolding`.
///   - numeric: When both current elements have an `intValue`, the digit runs are read
///     with `consumeNumbers` and compared by value.
///   - forceOrdering: Records a tie-breaking result (see `compareResult`) that is
///     returned when the inputs otherwise compare as `.orderedSame`.
/// - Returns: The ordering of the first difference that survives the options. If no
///   such difference is found, the result depends on which side still has elements
///   left; if neither does, `compareResult` is returned.
func _compare<S: _StringCompareOptionsIterable>(_ other: S, toHalfWidth: Bool, diacriticsInsensitive: Bool, caseFold: Bool, numeric: Bool, forceOrdering: Bool) -> ComparisonResult where S.Element == Element {
var idx1 = self.startIndex
var idx2 = other.startIndex
// Only written under `forceOrdering`; returned at the very end if everything else compared equal.
var compareResult: ComparisonResult = .orderedSame
// Hold the transformed form of the current element of each side. While a buffer is
// empty, that side is being read directly from the collection.
var norm1 = _StringCompareOptionsIterableBuffer<Element.IterableType>()
var norm2 = _StringCompareOptionsIterableBuffer<Element.IterableType>()
while idx1 < self.endIndex && idx2 < other.endIndex {
var c1: Element
var c2: Element
// Fetch the next element of each side: from its buffer if one is pending,
// otherwise straight from the collection.
if norm1.isEmpty {
c1 = self[idx1]
} else {
c1 = norm1.current
norm1.advance()
}
if norm2.isEmpty {
c2 = other[idx2]
} else {
c2 = norm2.current
norm2.advance()
}
// Numeric comparison: only when both sides are reading raw elements and both are digits.
if numeric, norm1.isEmpty, norm2.isEmpty, c1.intValue != nil, c2.intValue != nil {
let value1 = self.consumeNumbers(from: &idx1, initialValue: 0)
let value2 = other.consumeNumbers(from: &idx2, initialValue: 0)
if value1 == value2 {
if forceOrdering {
// Equal values: record a tie-break based on how far each side has advanced
// from its start. This overwrites any previously recorded tie-break.
let dist1 = self.distance(from: startIndex, to: idx1)
let dist2 = other.distance(from: other.startIndex, to: idx2)
if dist1 != dist2 {
compareResult = ComparisonResult(dist1, dist2)
}
}
// Both indices already point past the digit runs; skip the advance at the bottom.
continue
} else {
return ComparisonResult(value1, value2)
}
}
// Diacritic-insensitive: an extend element read directly from a collection is
// neutralized by copying the other side's element over it, so the `c1 != c2`
// check below passes.
if diacriticsInsensitive && idx1 > startIndex {
var str1Skip = false
var str2Skip = false
if norm1.isEmpty && c1.isExtendCharacter {
c1 = c2
str1Skip = true
}
if norm2.isEmpty && c2.isExtendCharacter {
c2 = c1
str2Skip = true
}
// Only one side skipped: step the other side's index back by one to offset the
// advance at the bottom of the loop.
// NOTE(review): only `idx1 > startIndex` is checked above, and the rewind does not
// check whether the other side's element came from its buffer — confirm intended.
if str1Skip != str2Skip {
if str1Skip {
other.formIndex(before: &idx2)
} else {
formIndex(before: &idx1)
}
}
}
if c1 != c2 {
// With no transforming option set, a raw difference is final.
if !(toHalfWidth || diacriticsInsensitive || caseFold) {
return ComparisonResult(c1, c2)
}
// Record the first raw difference as the tie-break before the transforms are applied.
if forceOrdering && compareResult == .orderedSame {
compareResult = ComparisonResult(c1, c2)
}
// Transform the left element unless it already came from a buffer. An empty
// transform result leaves `norm1` empty and `c1` unchanged.
if norm1.isEmpty {
let t1 = c1._transform(toHalfWidth: toHalfWidth, stripDiacritics: diacriticsInsensitive, caseFolding: caseFold)
if let first = t1.first {
c1 = first
norm1 = .init(t1)
norm1.advance()
}
}
// Left transform produced nothing while the right side is mid-buffer: the
// difference stands.
if norm1.isEmpty && !norm2.isEmpty {
return ComparisonResult(c1, c2)
}
// Transform the right element if it is raw and the elements still differ
// (or the left transform produced nothing).
if norm2.isEmpty && (norm1.isEmpty || c1 != c2) {
let t2 = c2._transform(toHalfWidth: toHalfWidth, stripDiacritics: diacriticsInsensitive, caseFolding: caseFold)
if let first = t2.first {
c2 = first
norm2 = .init(t2)
norm2.advance()
}
if norm2.isEmpty || c1 != c2 {
return ComparisonResult(c1, c2)
}
}
// Both sides are buffered: walk the two buffers in lockstep until one is exhausted
// or they differ.
if !norm1.isEmpty && !norm2.isEmpty {
while !norm1.isEnd && !norm2.isEnd {
if norm1.current != norm2.current {
break
}
norm1.advance()
norm2.advance()
}
// Neither buffer is exhausted, so the loop stopped on a mismatch.
if !norm1.isEnd && !norm2.isEnd {
return ComparisonResult(norm1.current, norm2.current)
}
}
}
// Drop buffers that have been fully consumed so the next iteration reads raw elements again.
if !norm1.isEmpty && norm1.isEnd {
norm1.clear()
}
if !norm2.isEmpty && norm2.isEnd {
norm2.clear()
}
// Advance a side only when it has no pending buffered elements.
if norm1.isEmpty {
formIndex(after: &idx1)
}
if norm2.isEmpty {
other.formIndex(after: &idx2)
}
}
// Skip any trailing extend elements on either side.
if diacriticsInsensitive {
self._consumeExtendCharacters(from: &idx1)
other._consumeExtendCharacters(from: &idx2)
}
// Whichever side still has elements left decides; if neither does, fall back to
// the recorded tie-break.
let result = ComparisonResult(stringIndex: idx1, idx2: idx2, endIndex1: endIndex, endIndex2: other.endIndex)
return result == .orderedSame ? compareResult : result
}
/// Searches `self` for a run of elements that matches `strToFind` under the given options.
///
/// Candidate start positions are visited from `fromLoc` to `toLoc`. At each candidate the
/// elements of `self` and `strToFind` are compared pairwise; elements that differ are passed
/// through `_transform(toHalfWidth:stripDiacritics:caseFolding:)` and compared again through
/// the `strBuf1` / `strBuf2` buffers.
///
/// - Parameters:
///   - strToFind: The elements to look for.
///   - toHalfWidth: Forwarded to `_transform` as `toHalfWidth`.
///   - diacriticsInsensitive: Forwarded to `_transform` as `stripDiacritics`; also lets extend
///     elements that follow the match be absorbed into the returned range.
///   - caseFold: Forwarded to `_transform` as `caseFolding`.
///   - anchored: Restricts the candidates (see the `fromLoc` / `toLoc` setup below).
///   - backwards: Visit candidates from the end of `self` towards the start.
/// - Returns: The range from the matching candidate position to the end of the matched
///   elements, or `nil` when no match is reported.
///
/// NOTE(review): the `index(endIndex, offsetBy: -1)` calls below require `self` to be
/// non-empty; `Substring._range(of:options:)` in this file guards for that before calling.
func _range<S: BidirectionalCollection>(of strToFind: S, toHalfWidth: Bool, diacriticsInsensitive: Bool, caseFold: Bool, anchored: Bool, backwards: Bool) -> Range<Index>? where S.Index == Index, S.Element == Element {
// No transforming option set: defer to the `_range(of:anchored:backwards:)` overload.
if !toHalfWidth && !diacriticsInsensitive && !caseFold {
return _range(of: strToFind, anchored: anchored, backwards: backwards)
}
// These options may cause the string to change their count
let lengthVariants = caseFold || diacriticsInsensitive
// `fromLoc` is the first candidate start position, `toLoc` the last one (inclusive).
var fromLoc: Index
var toLoc: Index
if backwards {
if lengthVariants {
// The match length is unknown, so start from the last element.
fromLoc = index(endIndex, offsetBy: -1)
} else {
// Fixed length: start `strToFind`'s count before the end; `nil` is treated as "no match possible".
guard let idx = _index(endIndex, backwardsOffsetByCountOf: strToFind) else {
return nil
}
fromLoc = idx
}
// Anchored with a fixed length: a single candidate. Otherwise scan back to the start.
toLoc = (anchored && !lengthVariants) ? fromLoc : startIndex
} else {
fromLoc = startIndex
if anchored {
// Anchored forward search: only the start position is a candidate.
toLoc = fromLoc
} else if lengthVariants {
toLoc = index(endIndex, offsetBy: -1)
} else {
guard let idx = _index(endIndex, backwardsOffsetByCountOf: strToFind) else {
return nil
}
toLoc = idx
}
}
// Step direction between candidates.
let delta = fromLoc <= toLoc ? 1 : -1
var result: Range<Index>? = nil
while true {
// Outer loop: loops through `self`
var str1Char: Element
var str2Char: Element
var str1Index = fromLoc
var str2Index = strToFind.startIndex
// `useStrBufN` is true while side N is reading from its transform buffer.
var useStrBuf1 = false
var useStrBuf2 = false
var strBuf1 = _StringCompareOptionsIterableBuffer<Element.IterableType>()
var strBuf2 = _StringCompareOptionsIterableBuffer<Element.IterableType>()
while str2Index < strToFind.endIndex {
// Inner loop: loops through `strToFind`
if !useStrBuf1 {
// Ran out of `self` before `strToFind` was fully consumed.
if str1Index == endIndex {
break
}
str1Char = self[str1Index]
} else {
str1Char = strBuf1.current
strBuf1.advance()
}
if !useStrBuf2 {
str2Char = strToFind[str2Index]
} else {
str2Char = strBuf2.current
strBuf2.advance()
}
if str1Char != str2Char {
// Transform the `self` element unless it already came from the buffer.
// An empty transform result leaves `useStrBuf1` false.
if !useStrBuf1 {
let transformed = str1Char._transform(toHalfWidth: toHalfWidth, stripDiacritics: diacriticsInsensitive, caseFolding: caseFold)
if let c = transformed.first {
str1Char = c
strBuf1 = .init(transformed)
strBuf1.advance()
useStrBuf1 = true
}
}
// `self`'s transform produced nothing while `strToFind` is mid-buffer: mismatch.
if !useStrBuf1 && useStrBuf2 { break }
// Transform the `strToFind` element if it is raw and the elements still differ.
if !useStrBuf2 && (!useStrBuf1 || str1Char != str2Char) {
let transformed = str2Char._transform(toHalfWidth: toHalfWidth, stripDiacritics: diacriticsInsensitive, caseFolding: caseFold)
if let c = transformed.first {
str2Char = c
strBuf2 = .init(transformed)
strBuf2.advance()
useStrBuf2 = true
}
// Also a mismatch when the transform result is empty (`first` is nil).
if str1Char != transformed.first {
break
}
}
}
// Both sides buffered: walk the buffers in lockstep until one is exhausted or they differ.
if useStrBuf1 && useStrBuf2 {
while !strBuf1.isEnd && !strBuf2.isEnd {
if strBuf1.current != strBuf2.current {
break
}
strBuf1.advance()
strBuf2.advance()
}
// Neither buffer is exhausted, so the loop stopped on a mismatch.
if !strBuf1.isEnd && !strBuf2.isEnd {
break
}
}
// A fully consumed buffer switches that side back to reading raw elements.
if useStrBuf1 && strBuf1.isEnd {
useStrBuf1 = false
}
if useStrBuf2 && strBuf2.isEnd {
useStrBuf2 = false
}
// Advance a side only when it has no pending buffered elements.
if !useStrBuf1 {
formIndex(after: &str1Index)
}
if !useStrBuf2 {
strToFind.formIndex(after: &str2Index)
}
}
// All of `strToFind` was consumed at this candidate.
if str2Index == strToFind.endIndex {
// If `self` has extended characters following the lastly matched character, consume these
var match = true
if useStrBuf1 {
// if strToFind matches the string after transformed (strBuf1), try consuming extended characters from the buffer first
match = false
if diacriticsInsensitive {
strBuf1._consumeExtendCharacters()
}
// Leftover buffered elements mean the last `self` element was only partially
// matched; that is not a match.
if strBuf1.isEnd {
formIndex(after: &str1Index)
match = true
}
}
// After using up strBuf1, inspect the rest of original strings in `self`
if match && diacriticsInsensitive && str1Index < endIndex {
_consumeExtendCharacters(from: &str1Index)
}
if match {
// An anchored backwards match must end exactly at `endIndex`.
if !(anchored && backwards) || str1Index == endIndex {
result = fromLoc..<str1Index
}
// The search stops at the first matching candidate, even when the check above
// rejected it and `result` is still nil.
break
}
}
// Last candidate visited.
if fromLoc == toLoc { break }
formIndex(&fromLoc, offsetBy: delta)
}
return result
}
}
// Conformances for the String family. `consumeNumbers(from:initialValue:)` and
// `_consumeExtendCharacters(from:)` come from the `_StringCompareOptionsIterable`
// extension defaults; `init()` and `first` are presumably satisfied by the types'
// existing members, since no witnesses are declared here.
extension String : _StringCompareOptionsIterable {}
extension Substring: _StringCompareOptionsIterable {}
extension String.UnicodeScalarView: _StringCompareOptionsIterable {}
extension Substring.UnicodeScalarView: _StringCompareOptionsIterable {}
extension String.UTF8View: _StringCompareOptionsIterable {
    /// Creates an empty UTF-8 view by taking the view of an empty `String`.
    init() {
        let empty = String()
        self = empty.utf8
    }
}
extension Substring.UTF8View: _StringCompareOptionsIterable {
    /// Creates an empty UTF-8 view by taking the view of an empty `Substring`.
    init() {
        let empty = Substring()
        self = empty.utf8
    }
}
extension Unicode.UTF8.CodeUnit : _StringCompareOptionsConvertible {
    /// Builds a `String` from this single code unit and returns its UTF-8 view.
    ///
    /// When `caseFolding` is true the code unit is first mapped through `_lowercased`.
    /// `toHalfWidth` and `stripDiacritics` are not consulted at the code-unit level.
    func _transform(toHalfWidth: Bool, stripDiacritics: Bool, caseFolding: Bool) -> String.UTF8View {
        String(unsafeUninitializedCapacity: 1) {
            $0[0] = caseFolding ? self._lowercased : self
            return 1
        }.utf8
    }

    /// The value `0...9` when this code unit is an ASCII decimal digit
    /// ("0" = 48 through "9" = 57), otherwise `nil`.
    var intValue: Int? {
        // Both bounds must hold. With `||` the test is true for every byte, so
        // `self - 48` traps on UInt8 underflow for bytes below 48 and yields
        // bogus "digit" values for bytes above 57.
        return (self >= 48 && self <= 57) ? Int(self - 48) : nil
    }

    /// Always `false` for code units.
    var isExtendCharacter: Bool {
        // This won't really get called and will be removed in a future PR
        return false
    }
}
extension Character : _StringCompareOptionsConvertible {
    /// Returns the normalized form of this character for the requested options.
    ///
    /// ASCII characters are only affected by `caseFolding` (they are lowercased).
    /// For any other character each scalar is processed in turn: optionally
    /// converted to half width, dropped when diacritics are stripped and the
    /// scalar is a grapheme extender, otherwise stripped of diacritics, and
    /// finally case-folded or appended as is.
    func _transform(toHalfWidth: Bool, stripDiacritics: Bool, caseFolding: Bool) -> String {
        guard !isASCII else {
            // we only need to handle case folding, in which case is just lower case
            return caseFolding ? lowercased() : String(self)
        }

        var normalized = ""
        for scalar in unicodeScalars {
            var mapped = toHalfWidth ? scalar._toHalfWidth() : scalar
            if stripDiacritics {
                // The extender check looks at the original scalar, not the half-width one.
                guard !scalar._isGraphemeExtend else { continue }
                mapped = mapped._stripDiacritics()
            }
            normalized += caseFolding ? mapped._caseFoldMapping : String(mapped)
        }
        return normalized
    }

    /// The character's `wholeNumberValue`.
    var intValue: Int? {
        wholeNumberValue
    }

    /// True for a non-ASCII character all of whose scalars are grapheme extenders.
    var isExtendCharacter: Bool {
        !isASCII && unicodeScalars.allSatisfy { $0._isGraphemeExtend }
    }
}
extension UnicodeScalar : _StringCompareOptionsConvertible {
    /// Returns the normalized form of this scalar for the requested options.
    ///
    /// The scalar is optionally converted to half width. When diacritics are
    /// stripped, a grapheme extender yields an empty view and any other scalar
    /// has its diacritics removed. The result is then case-folded or wrapped as is.
    func _transform(toHalfWidth: Bool, stripDiacritics: Bool, caseFolding: Bool) -> String.UnicodeScalarView {
        var mapped = toHalfWidth ? _toHalfWidth() : self
        if stripDiacritics {
            // The extender check looks at the (possibly half-width) scalar.
            guard !mapped._isGraphemeExtend else {
                return String.UnicodeScalarView()
            }
            mapped = mapped._stripDiacritics()
        }
        let text = caseFolding ? mapped._caseFoldMapping : String(mapped)
        return text.unicodeScalars
    }

    /// The scalar's `properties.numericValue` converted with `Int(_:)`, or `nil`
    /// when the scalar has no numeric value.
    var intValue: Int? {
        properties.numericValue.map { Int($0) }
    }

    /// Whether the scalar is a grapheme extender.
    var isExtendCharacter: Bool {
        _isGraphemeExtend
    }
}
// MARK: - _StringCompareOptionsIterableBuffer
/// A collection paired with a read cursor, used to step through the result of
/// an element's `_transform`.
internal struct _StringCompareOptionsIterableBuffer<StorageType: _StringCompareOptionsIterable> {
    /// The buffered elements.
    var _buf: StorageType
    /// The read cursor into `_buf`.
    var _index: StorageType.Index

    /// Creates a buffer over `content` with the cursor at its start.
    init(_ content: StorageType) {
        _buf = content
        _index = _buf.startIndex
    }

    /// Creates a buffer over an empty collection.
    init() {
        self.init(StorageType())
    }

    /// Whether the underlying collection has no elements. This does not depend
    /// on the cursor position.
    var isEmpty: Bool { _buf.isEmpty }

    /// Whether the cursor has reached the end of the underlying collection.
    var isEnd: Bool { _index == _buf.endIndex }

    /// The element at the cursor. Must not be read when `isEnd` is true.
    var current: StorageType.Element { _buf[_index] }

    /// Moves the cursor to the next element.
    mutating func advance() {
        _buf.formIndex(after: &_index)
    }

    /// Moves the cursor past consecutive extend elements.
    mutating func _consumeExtendCharacters() {
        _buf._consumeExtendCharacters(from: &_index)
    }

    /// Resets the buffer to the empty state.
    mutating func clear() {
        self = Self()
    }
}
// MARK: Comparison Implementations
extension Substring {
/// Compares this substring with `other` under `options`, without locale-specific rules.
///
/// - Parameters:
///   - other: The substring to compare against.
///   - options: The comparison options. With no options set the two substrings
///     are ordered with `ComparisonResult(_:_:)` directly.
/// - Returns: The ordering of `self` relative to `other`. Under `.forcedOrdering`,
///   inputs that still compare equal are additionally ordered scalar by scalar.
func _unlocalizedCompare(other: Substring, options: String.CompareOptions) -> ComparisonResult {
    guard !options.isEmpty else {
        return ComparisonResult(self, other)
    }

    let halfWidth = options.contains(.widthInsensitive)
    let ignoresDiacritics = options.contains(.diacriticInsensitive)
    let ignoresCase = options.contains(.caseInsensitive)
    let comparesNumerically = options.contains(.numeric)
    let mustOrder = options.contains(.forcedOrdering)

    let outcome: ComparisonResult
    if options.contains(.literal) {
        // Per documentation, literal means "Performs a byte-for-byte comparison. Differing literal sequences (such as composed character sequences) that would otherwise be considered equivalent are considered not to match." Therefore we're comparing the scalars rather than characters
        outcome = unicodeScalars._compare(other.unicodeScalars, toHalfWidth: halfWidth, diacriticsInsensitive: ignoresDiacritics, caseFold: ignoresCase, numeric: comparesNumerically, forceOrdering: mustOrder)
    } else {
        outcome = _compare(other, toHalfWidth: halfWidth, diacriticsInsensitive: ignoresDiacritics, caseFold: ignoresCase, numeric: comparesNumerically, forceOrdering: mustOrder)
    }

    guard outcome == .orderedSame, mustOrder else {
        return outcome
    }
    // Final tie-break: plain scalar-by-scalar ordering.
    return unicodeScalars._compare(other.unicodeScalars)
}
#if FOUNDATION_FRAMEWORK
/// Finds a scalar contained in `set` and returns its range.
///
/// Only `.anchored` and `.backwards` are read from `options`.
/// Returns `nil` when the substring is empty or no scalar matches.
func _rangeOfCharacter(from set: CharacterSet, options: String.CompareOptions) -> Range<Index>? {
    if isEmpty { return nil }
    let isAnchored = options.contains(.anchored)
    let searchesBackwards = options.contains(.backwards)
    return unicodeScalars._rangeOfCharacter(anchored: isAnchored, backwards: searchesBackwards, matchingPredicate: set.contains)
}
#endif
/// Finds a scalar contained in `set` and returns its range.
///
/// Only `.anchored` and `.backwards` are read from `options`.
/// Returns `nil` when the substring is empty or no scalar matches.
func _rangeOfCharacter(from set: BuiltInUnicodeScalarSet, options: String.CompareOptions) -> Range<Index>? {
    if isEmpty { return nil }
    let isAnchored = options.contains(.anchored)
    let searchesBackwards = options.contains(.backwards)
    return unicodeScalars._rangeOfCharacter(anchored: isAnchored, backwards: searchesBackwards, matchingPredicate: set.contains)
}
/// Finds `strToFind` in this substring under `options`.
///
/// With `.regularExpression` (when regex support is compiled in), `strToFind`
/// is used as a pattern; only `.caseInsensitive` and `.anchored` are read in
/// that mode. Otherwise the search runs over characters, or over unicode
/// scalars when `.literal` is set.
///
/// - Returns: The matched range, or `nil` when there is no match, when either
///   string is empty (non-regex path), or when no regex is available for the pattern.
/// - Throws: Whatever the regex cache lookup throws.
func _range(of strToFind: Substring, options: String.CompareOptions) throws -> Range<Index>? {
#if !NO_REGEX
    if options.contains(.regularExpression) {
        let ignoresCase = options.contains(.caseInsensitive)
        guard let regex = try RegexPatternCache.cache.regex(for: String(strToFind), caseInsensitive: ignoresCase) else {
            return nil
        }
        if options.contains(.anchored) {
            return prefixMatch(of: regex)?.range
        }
        return firstMatch(of: regex)?.range
    }
#endif
    if isEmpty || strToFind.isEmpty {
        return nil
    }

    let halfWidth = options.contains(.widthInsensitive)
    let ignoresDiacritics = options.contains(.diacriticInsensitive)
    let ignoresCase = options.contains(.caseInsensitive)
    let isAnchored = options.contains(.anchored)
    let searchesBackwards = options.contains(.backwards)

    guard options.contains(.literal) else {
        return _range(of: strToFind, toHalfWidth: halfWidth, diacriticsInsensitive: ignoresDiacritics, caseFold: ignoresCase, anchored: isAnchored, backwards: searchesBackwards)
    }
    // Literal search compares unicode scalars instead of characters.
    return unicodeScalars._range(of: strToFind.unicodeScalars, toHalfWidth: halfWidth, diacriticsInsensitive: ignoresDiacritics, caseFold: ignoresCase, anchored: isAnchored, backwards: searchesBackwards)
}
/// Splits this substring around occurrences of `separator` and returns the
/// pieces as strings, in order.
///
/// - Throws: Whatever `_enumerateComponents(separatedBy:options:withBlock:)` throws.
func _components(separatedBy separator: Substring, options: String.CompareOptions = []) throws -> [String] {
    var pieces: [String] = []
    try _enumerateComponents(separatedBy: separator, options: options) { piece, _ in
        pieces.append(String(piece))
    }
    return pieces
}
// Only throws when using `.regularExpression` option
/// Calls `block` once for each component between occurrences of `separator`.
///
/// Every component except the final one is reported with `isLastComponent`
/// set to `false`. The remainder after the last separator (possibly empty) is
/// always reported, with `isLastComponent` set to `true`. Enumeration of
/// separators stops at the first search that finds nothing or returns an
/// empty range.
package func _enumerateComponents(separatedBy separator: Substring, options: String.CompareOptions, withBlock block: (_ component: Substring, _ isLastComponent: Bool) -> ()) throws {
    var cursor = startIndex
    while cursor < endIndex {
        guard let hit = try self[cursor...]._range(of: separator, options: options), !hit.isEmpty else {
            break
        }
        block(self[cursor..<hit.lowerBound], false)
        cursor = hit.upperBound
    }
    block(self[cursor..<endIndex], true)
}
}
extension Substring.UnicodeScalarView {
/// Orders two scalar views by their first differing scalar.
///
/// When one view is a prefix of the other (or they are equal), the result is
/// decided by which view still has scalars left.
func _compare(_ other: Substring.UnicodeScalarView) -> ComparisonResult {
    var lhsIndex = startIndex
    var rhsIndex = other.startIndex
    while lhsIndex < endIndex, rhsIndex < other.endIndex {
        let lhsScalar = self[lhsIndex]
        let rhsScalar = other[rhsIndex]
        guard lhsScalar == rhsScalar else {
            return ComparisonResult(lhsScalar, rhsScalar)
        }
        formIndex(after: &lhsIndex)
        other.formIndex(after: &rhsIndex)
    }
    return ComparisonResult(stringIndex: lhsIndex, idx2: rhsIndex, endIndex1: endIndex, endIndex2: other.endIndex)
}
/// Finds the first scalar, in scan order, that satisfies `predicate`.
///
/// - Parameters:
///   - anchored: Test only the first scalar in scan order.
///   - backwards: Scan from the last scalar towards the first.
///   - predicate: The test applied to each visited scalar.
/// - Returns: The one-scalar range of the match, or `nil` when the view is
///   empty or no visited scalar matches.
func _rangeOfCharacter(anchored: Bool, backwards: Bool, matchingPredicate predicate: (Unicode.Scalar) -> Bool) -> Range<Index>? {
    guard !isEmpty else { return nil }

    let lastPosition = index(before: endIndex)
    let start: String.Index = backwards ? lastPosition : startIndex
    let farEnd: String.Index = backwards ? startIndex : lastPosition
    // The last position to test (inclusive); an anchored search tests only `start`.
    let stop: String.Index = anchored ? start : farEnd
    let step = backwards ? -1 : 1

    var cursor = start
    while true {
        if predicate(self[cursor]) {
            return cursor..<index(after: cursor)
        }
        if cursor == stop {
            return nil
        }
        formIndex(&cursor, offsetBy: step)
    }
}
}
// MARK: - ComparisonResult Extension
extension ComparisonResult {
    /// Derives an ordering from how far two cursors got in their collections.
    ///
    /// - `.orderedSame` when both cursors reached their end index.
    /// - `.orderedAscending` when only the first cursor reached its end.
    /// - `.orderedDescending` otherwise.
    init<Index: Equatable>(stringIndex idx1: Index, idx2: Index, endIndex1: Index, endIndex2: Index) {
        switch (idx1 == endIndex1, idx2 == endIndex2) {
        case (true, true):
            self = .orderedSame
        case (true, false):
            self = .orderedAscending
        case (false, _):
            self = .orderedDescending
        }
    }

    /// Orders two comparable values using `<` and then `>`; values for which
    /// neither holds are reported as `.orderedSame`.
    init<T: Comparable>(_ t1: T, _ t2: T) {
        guard !(t1 < t2) else {
            self = .orderedAscending
            return
        }
        self = t1 > t2 ? .orderedDescending : .orderedSame
    }
}
// Borrowed from stdlib
/// Returns whether every byte in `input` has its high bit clear.
///
/// An empty buffer yields `true`. Bytes are tested one at a time until the
/// address is word-aligned, then one `UInt` at a time, then one at a time for
/// whatever is left.
internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
    guard !input.isEmpty, let base = input.baseAddress else { return true }

    let length = input.count
    let wordSize = MemoryLayout<UInt>.stride
    let baseBits = Int(bitPattern: base)
    // High bit of every byte in a word, and of a single byte.
    let wordHighBits = UInt(truncatingIfNeeded: 0x8080_8080_8080_8080 as UInt64)
    let byteHighBit = UInt8(truncatingIfNeeded: wordHighBits)
    var offset = 0

    // Head: single bytes until the current address is a multiple of the word size.
    while offset < length, (baseBits &+ offset) % wordSize != 0 {
        if base[offset] & byteHighBit != 0 { return false }
        offset &+= 1
    }

    // Body: whole words, read through an aligned pointer.
    while offset &+ wordSize <= length {
        let word: UInt = UnsafePointer(bitPattern: baseBits &+ offset).unsafelyUnwrapped.pointee
        if word & wordHighBits != 0 { return false }
        offset &+= wordSize
    }

    // Tail: remaining bytes that do not fill a word.
    while offset < length {
        if base[offset] & byteHighBit != 0 { return false }
        offset &+= 1
    }
    return true
}