//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCUDACXX_ALGORITHM
#define _LIBCUDACXX_ALGORITHM

/*
    algorithm synopsis

#include <initializer_list>

namespace std
{

template <class InputIterator, class Predicate>
    constexpr bool     // constexpr in C++20
    all_of(InputIterator first, InputIterator last, Predicate pred);

template <class InputIterator, class Predicate>
    constexpr bool     // constexpr in C++20
    any_of(InputIterator first, InputIterator last, Predicate pred);

template <class InputIterator, class Predicate>
    constexpr bool     // constexpr in C++20
    none_of(InputIterator first, InputIterator last, Predicate pred);

template <class InputIterator, class Function>
    constexpr Function          // constexpr in C++20
    for_each(InputIterator first, InputIterator last, Function f);

template<class InputIterator, class Size, class Function>
    constexpr InputIterator     // constexpr in C++20
    for_each_n(InputIterator first, Size n, Function f); // C++17

template <class InputIterator, class T>
    constexpr InputIterator     // constexpr in C++20
    find(InputIterator first, InputIterator last, const T& value);

template <class InputIterator, class Predicate>
    constexpr InputIterator     // constexpr in C++20
    find_if(InputIterator first, InputIterator last, Predicate pred);

template<class InputIterator, class Predicate>
    constexpr InputIterator     // constexpr in C++20
    find_if_not(InputIterator first, InputIterator last, Predicate pred);

template <class ForwardIterator1, class ForwardIterator2>
    constexpr ForwardIterator1  // constexpr in C++20
    find_end(ForwardIterator1 first1, ForwardIterator1 last1,
             ForwardIterator2 first2, ForwardIterator2 last2);

template <class ForwardIterator1, class ForwardIterator2, class BinaryPredicate>
    constexpr ForwardIterator1  // constexpr in C++20
    find_end(ForwardIterator1 first1, ForwardIterator1 last1,
             ForwardIterator2 first2, ForwardIterator2 last2, BinaryPredicate pred);

template <class ForwardIterator1, class ForwardIterator2>
    constexpr ForwardIterator1  // constexpr in C++20
    find_first_of(ForwardIterator1 first1, ForwardIterator1 last1,
                  ForwardIterator2 first2, ForwardIterator2 last2);

template <class ForwardIterator1, class ForwardIterator2, class BinaryPredicate>
    constexpr ForwardIterator1  // constexpr in C++20
    find_first_of(ForwardIterator1 first1, ForwardIterator1 last1,
                  ForwardIterator2 first2, ForwardIterator2 last2, BinaryPredicate pred);

template <class ForwardIterator>
    constexpr ForwardIterator   // constexpr in C++20
    adjacent_find(ForwardIterator first, ForwardIterator last);

template <class ForwardIterator, class BinaryPredicate>
    constexpr ForwardIterator   // constexpr in C++20
    adjacent_find(ForwardIterator first, ForwardIterator last, BinaryPredicate pred);

template <class InputIterator, class T>
    constexpr typename iterator_traits<InputIterator>::difference_type  // constexpr in C++20
    count(InputIterator first, InputIterator last, const T& value);

template <class InputIterator, class Predicate>
    constexpr typename iterator_traits<InputIterator>::difference_type // constexpr in C++20
    count_if(InputIterator first, InputIterator last, Predicate pred);

template <class InputIterator1, class InputIterator2>
    constexpr pair<InputIterator1, InputIterator2>   // constexpr in C++20
    mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2);

template <class InputIterator1, class InputIterator2>
    constexpr pair<InputIterator1, InputIterator2>   // constexpr in C++20
    mismatch(InputIterator1 first1, InputIterator1 last1,
             InputIterator2 first2, InputIterator2 last2); // **C++14**

template <class InputIterator1, class InputIterator2, class BinaryPredicate>
    constexpr pair<InputIterator1, InputIterator2>   // constexpr in C++20
    mismatch(InputIterator1 first1, InputIterator1 last1,
             InputIterator2 first2, BinaryPredicate pred);

template <class InputIterator1, class InputIterator2, class BinaryPredicate>
    constexpr pair<InputIterator1, InputIterator2>   // constexpr in C++20
    mismatch(InputIterator1 first1, InputIterator1 last1,
             InputIterator2 first2, InputIterator2 last2,
             BinaryPredicate pred); // **C++14**

template <class InputIterator1, class InputIterator2>
    constexpr bool      // constexpr in C++20
    equal(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2);

template <class InputIterator1, class InputIterator2>
    constexpr bool      // constexpr in C++20
    equal(InputIterator1 first1, InputIterator1 last1,
          InputIterator2 first2, InputIterator2 last2); // **C++14**

template <class InputIterator1, class InputIterator2, class BinaryPredicate>
    constexpr bool      // constexpr in C++20
    equal(InputIterator1 first1, InputIterator1 last1,
          InputIterator2 first2, BinaryPredicate pred);

template <class InputIterator1, class InputIterator2, class BinaryPredicate>
    constexpr bool      // constexpr in C++20
    equal(InputIterator1 first1, InputIterator1 last1,
          InputIterator2 first2, InputIterator2 last2,
          BinaryPredicate pred); // **C++14**

template<class ForwardIterator1, class ForwardIterator2>
    constexpr bool      // constexpr in C++20
    is_permutation(ForwardIterator1 first1, ForwardIterator1 last1,
                   ForwardIterator2 first2);

template<class ForwardIterator1, class ForwardIterator2>
    constexpr bool      // constexpr in C++20
    is_permutation(ForwardIterator1 first1, ForwardIterator1 last1,
                   ForwardIterator2 first2, ForwardIterator2 last2); // **C++14**

template<class ForwardIterator1, class ForwardIterator2, class BinaryPredicate>
    constexpr bool      // constexpr in C++20
    is_permutation(ForwardIterator1 first1, ForwardIterator1 last1,
                   ForwardIterator2 first2, BinaryPredicate pred);

template<class ForwardIterator1, class ForwardIterator2, class BinaryPredicate>
    constexpr bool      // constexpr in C++20
    is_permutation(ForwardIterator1 first1, ForwardIterator1 last1,
                   ForwardIterator2 first2, ForwardIterator2 last2,
                   BinaryPredicate pred);  // **C++14**

template <class ForwardIterator1, class ForwardIterator2>
    constexpr ForwardIterator1      // constexpr in C++20
    search(ForwardIterator1 first1, ForwardIterator1 last1,
           ForwardIterator2 first2, ForwardIterator2 last2);

template <class ForwardIterator1, class ForwardIterator2, class BinaryPredicate>
    constexpr ForwardIterator1      // constexpr in C++20
    search(ForwardIterator1 first1, ForwardIterator1 last1,
           ForwardIterator2 first2, ForwardIterator2 last2, BinaryPredicate pred);

template <class ForwardIterator, class Size, class T>
    constexpr ForwardIterator       // constexpr in C++20
    search_n(ForwardIterator first, ForwardIterator last, Size count, const T& value);

template <class ForwardIterator, class Size, class T, class BinaryPredicate>
    constexpr ForwardIterator       // constexpr in C++20
    search_n(ForwardIterator first, ForwardIterator last,
             Size count, const T& value, BinaryPredicate pred);

template <class InputIterator, class OutputIterator>
    OutputIterator
    copy(InputIterator first, InputIterator last, OutputIterator result);

template<class InputIterator, class OutputIterator, class Predicate>
    OutputIterator
    copy_if(InputIterator first, InputIterator last,
            OutputIterator result, Predicate pred);

template<class InputIterator, class Size, class OutputIterator>
    OutputIterator
    copy_n(InputIterator first, Size n, OutputIterator result);

template <class BidirectionalIterator1, class BidirectionalIterator2>
    BidirectionalIterator2
    copy_backward(BidirectionalIterator1 first, BidirectionalIterator1 last,
                  BidirectionalIterator2 result);

template <class ForwardIterator1, class ForwardIterator2>
    ForwardIterator2
    swap_ranges(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2);

template <class ForwardIterator1, class ForwardIterator2>
    void
    iter_swap(ForwardIterator1 a, ForwardIterator2 b);

template <class InputIterator, class OutputIterator, class UnaryOperation>
    constexpr OutputIterator      // constexpr in C++20
    transform(InputIterator first, InputIterator last, OutputIterator result, UnaryOperation op);

template <class InputIterator1, class InputIterator2, class OutputIterator, class BinaryOperation>
    constexpr OutputIterator      // constexpr in C++20
    transform(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2,
              OutputIterator result, BinaryOperation binary_op);

template <class ForwardIterator, class T>
    constexpr void      // constexpr in C++20
    replace(ForwardIterator first, ForwardIterator last, const T& old_value, const T& new_value);

template <class ForwardIterator, class Predicate, class T>
    constexpr void      // constexpr in C++20
    replace_if(ForwardIterator first, ForwardIterator last, Predicate pred, const T& new_value);

template <class InputIterator, class OutputIterator, class T>
    constexpr OutputIterator      // constexpr in C++20
    replace_copy(InputIterator first, InputIterator last, OutputIterator result,
                 const T& old_value, const T& new_value);

template <class InputIterator, class OutputIterator, class Predicate, class T>
    constexpr OutputIterator      // constexpr in C++20
    replace_copy_if(InputIterator first, InputIterator last, OutputIterator result, Predicate pred, const T& new_value);

template <class ForwardIterator, class T>
    constexpr void      // constexpr in C++20
    fill(ForwardIterator first, ForwardIterator last, const T& value);

template <class OutputIterator, class Size, class T>
    constexpr OutputIterator      // constexpr in C++20
    fill_n(OutputIterator first, Size n, const T& value);

template <class ForwardIterator, class Generator>
    constexpr void      // constexpr in C++20
    generate(ForwardIterator first, ForwardIterator last, Generator gen);

template <class OutputIterator, class Size, class Generator>
    constexpr OutputIterator      // constexpr in C++20
    generate_n(OutputIterator first, Size n, Generator gen);

template <class ForwardIterator, class T>
    constexpr ForwardIterator     // constexpr in C++20
    remove(ForwardIterator first, ForwardIterator last, const T& value);

template <class ForwardIterator, class Predicate>
    constexpr ForwardIterator     // constexpr in C++20
    remove_if(ForwardIterator first, ForwardIterator last, Predicate pred);

template <class InputIterator, class OutputIterator, class T>
    constexpr OutputIterator     // constexpr in C++20
    remove_copy(InputIterator first, InputIterator last, OutputIterator result, const T& value);

template <class InputIterator, class OutputIterator, class Predicate>
    constexpr OutputIterator     // constexpr in C++20
    remove_copy_if(InputIterator first, InputIterator last, OutputIterator result, Predicate pred);

template <class ForwardIterator>
    ForwardIterator
    unique(ForwardIterator first, ForwardIterator last);

template <class ForwardIterator, class BinaryPredicate>
    ForwardIterator
    unique(ForwardIterator first, ForwardIterator last, BinaryPredicate pred);

template <class InputIterator, class OutputIterator>
    OutputIterator
    unique_copy(InputIterator first, InputIterator last, OutputIterator result);

template <class InputIterator, class OutputIterator, class BinaryPredicate>
    OutputIterator
    unique_copy(InputIterator first, InputIterator last, OutputIterator result, BinaryPredicate pred);

template <class BidirectionalIterator>
    void
    reverse(BidirectionalIterator first, BidirectionalIterator last);

template <class BidirectionalIterator, class OutputIterator>
    constexpr OutputIterator       // constexpr in C++20
    reverse_copy(BidirectionalIterator first, BidirectionalIterator last, OutputIterator result);

template <class ForwardIterator>
    ForwardIterator
    rotate(ForwardIterator first, ForwardIterator middle, ForwardIterator last);

template <class ForwardIterator, class OutputIterator>
    OutputIterator
    rotate_copy(ForwardIterator first, ForwardIterator middle, ForwardIterator last, OutputIterator result);

template <class RandomAccessIterator>
    void
    random_shuffle(RandomAccessIterator first, RandomAccessIterator last); // deprecated in C++14, removed in C++17

template <class RandomAccessIterator, class RandomNumberGenerator>
    void
    random_shuffle(RandomAccessIterator first, RandomAccessIterator last,
                   RandomNumberGenerator& rand);  // deprecated in C++14, removed in C++17

template<class PopulationIterator, class SampleIterator,
         class Distance, class UniformRandomBitGenerator>
    SampleIterator sample(PopulationIterator first, PopulationIterator last,
                          SampleIterator out, Distance n,
                          UniformRandomBitGenerator&& g); // C++17

template<class RandomAccessIterator, class UniformRandomNumberGenerator>
    void shuffle(RandomAccessIterator first, RandomAccessIterator last,
                 UniformRandomNumberGenerator&& g);

template <class InputIterator, class Predicate>
    constexpr bool  // constexpr in C++20
    is_partitioned(InputIterator first, InputIterator last, Predicate pred);

template <class ForwardIterator, class Predicate>
    ForwardIterator
    partition(ForwardIterator first, ForwardIterator last, Predicate pred);

template <class InputIterator, class OutputIterator1,
          class OutputIterator2, class Predicate>
    constexpr pair<OutputIterator1, OutputIterator2>   // constexpr in C++20
    partition_copy(InputIterator first, InputIterator last,
                   OutputIterator1 out_true, OutputIterator2 out_false,
                   Predicate pred);

template <class ForwardIterator, class Predicate>
    ForwardIterator
    stable_partition(ForwardIterator first, ForwardIterator last, Predicate pred);

template<class ForwardIterator, class Predicate>
    constexpr ForwardIterator  // constexpr in C++20
    partition_point(ForwardIterator first, ForwardIterator last, Predicate pred);

template <class ForwardIterator>
    constexpr bool  // constexpr in C++20
    is_sorted(ForwardIterator first, ForwardIterator last);

template <class ForwardIterator, class Compare>
    bool
    is_sorted(ForwardIterator first, ForwardIterator last, Compare comp);

template<class ForwardIterator>
    constexpr ForwardIterator    // constexpr in C++20
    is_sorted_until(ForwardIterator first, ForwardIterator last);

template <class ForwardIterator, class Compare>
    constexpr ForwardIterator    // constexpr in C++20
    is_sorted_until(ForwardIterator first, ForwardIterator last, Compare comp);

template <class RandomAccessIterator>
    void
    sort(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    sort(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    void
    stable_sort(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    stable_sort(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    void
    partial_sort(RandomAccessIterator first, RandomAccessIterator middle, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    partial_sort(RandomAccessIterator first, RandomAccessIterator middle, RandomAccessIterator last, Compare comp);

template <class InputIterator, class RandomAccessIterator>
    RandomAccessIterator
    partial_sort_copy(InputIterator first, InputIterator last,
                      RandomAccessIterator result_first, RandomAccessIterator result_last);

template <class InputIterator, class RandomAccessIterator, class Compare>
    RandomAccessIterator
    partial_sort_copy(InputIterator first, InputIterator last,
                      RandomAccessIterator result_first, RandomAccessIterator result_last, Compare comp);

template <class RandomAccessIterator>
    void
    nth_element(RandomAccessIterator first, RandomAccessIterator nth, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    nth_element(RandomAccessIterator first, RandomAccessIterator nth, RandomAccessIterator last, Compare comp);

template <class ForwardIterator, class T>
    constexpr ForwardIterator                         // constexpr in C++20
    lower_bound(ForwardIterator first, ForwardIterator last, const T& value);

template <class ForwardIterator, class T, class Compare>
    constexpr ForwardIterator                         // constexpr in C++20
    lower_bound(ForwardIterator first, ForwardIterator last, const T& value, Compare comp);

template <class ForwardIterator, class T>
    constexpr ForwardIterator                         // constexpr in C++20
    upper_bound(ForwardIterator first, ForwardIterator last, const T& value);

template <class ForwardIterator, class T, class Compare>
    constexpr ForwardIterator                         // constexpr in C++20
    upper_bound(ForwardIterator first, ForwardIterator last, const T& value, Compare comp);

template <class ForwardIterator, class T>
    constexpr pair<ForwardIterator, ForwardIterator>  // constexpr in C++20
    equal_range(ForwardIterator first, ForwardIterator last, const T& value);

template <class ForwardIterator, class T, class Compare>
    constexpr pair<ForwardIterator, ForwardIterator>  // constexpr in C++20
    equal_range(ForwardIterator first, ForwardIterator last, const T& value, Compare comp);

template <class ForwardIterator, class T>
    constexpr bool                                    // constexpr in C++20
    binary_search(ForwardIterator first, ForwardIterator last, const T& value);

template <class ForwardIterator, class T, class Compare>
    constexpr bool                                    // constexpr in C++20
    binary_search(ForwardIterator first, ForwardIterator last, const T& value, Compare comp);

template <class InputIterator1, class InputIterator2, class OutputIterator>
    OutputIterator
    merge(InputIterator1 first1, InputIterator1 last1,
          InputIterator2 first2, InputIterator2 last2, OutputIterator result);

template <class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
    OutputIterator
    merge(InputIterator1 first1, InputIterator1 last1,
          InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp);

template <class BidirectionalIterator>
    void
    inplace_merge(BidirectionalIterator first, BidirectionalIterator middle, BidirectionalIterator last);

template <class BidirectionalIterator, class Compare>
    void
    inplace_merge(BidirectionalIterator first, BidirectionalIterator middle, BidirectionalIterator last, Compare comp);

template <class InputIterator1, class InputIterator2>
    constexpr bool                                    // constexpr in C++20
    includes(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2);

template <class InputIterator1, class InputIterator2, class Compare>
    constexpr bool                                    // constexpr in C++20
    includes(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, Compare comp);

template <class InputIterator1, class InputIterator2, class OutputIterator>
    OutputIterator
    set_union(InputIterator1 first1, InputIterator1 last1,
              InputIterator2 first2, InputIterator2 last2, OutputIterator result);

template <class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
    OutputIterator
    set_union(InputIterator1 first1, InputIterator1 last1,
              InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp);

template <class InputIterator1, class InputIterator2, class OutputIterator>
    constexpr OutputIterator                         // constexpr in C++20
    set_intersection(InputIterator1 first1, InputIterator1 last1,
                     InputIterator2 first2, InputIterator2 last2, OutputIterator result);

template <class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
    constexpr OutputIterator                         // constexpr in C++20
    set_intersection(InputIterator1 first1, InputIterator1 last1,
                     InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp);

template <class InputIterator1, class InputIterator2, class OutputIterator>
    OutputIterator
    set_difference(InputIterator1 first1, InputIterator1 last1,
                   InputIterator2 first2, InputIterator2 last2, OutputIterator result);

template <class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
    OutputIterator
    set_difference(InputIterator1 first1, InputIterator1 last1,
                   InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp);

template <class InputIterator1, class InputIterator2, class OutputIterator>
    OutputIterator
    set_symmetric_difference(InputIterator1 first1, InputIterator1 last1,
                             InputIterator2 first2, InputIterator2 last2, OutputIterator result);

template <class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
    OutputIterator
    set_symmetric_difference(InputIterator1 first1, InputIterator1 last1,
                             InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp);

template <class RandomAccessIterator>
    void
    push_heap(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    push_heap(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    void
    pop_heap(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    pop_heap(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    void
    make_heap(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    make_heap(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    void
    sort_heap(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    sort_heap(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    constexpr bool   // constexpr in C++20
    is_heap(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    constexpr bool   // constexpr in C++20
    is_heap(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    constexpr RandomAccessIterator   // constexpr in C++20
    is_heap_until(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    constexpr RandomAccessIterator   // constexpr in C++20
    is_heap_until(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class ForwardIterator>
    ForwardIterator
    min_element(ForwardIterator first, ForwardIterator last);  // constexpr in C++14

template <class ForwardIterator, class Compare>
    ForwardIterator
    min_element(ForwardIterator first, ForwardIterator last, Compare comp);  // constexpr in C++14

template <class T>
    const T&
    min(const T& a, const T& b);  // constexpr in C++14

template <class T, class Compare>
    const T&
    min(const T& a, const T& b, Compare comp);  // constexpr in C++14

template<class T>
    T
    min(::std::initializer_list<T> t);  // constexpr in C++14

template<class T, class Compare>
    T
    min(::std::initializer_list<T> t, Compare comp);  // constexpr in C++14

template<class T>
    constexpr const T& clamp( const T& v, const T& lo, const T& hi );               // C++17

template<class T, class Compare>
    constexpr const T& clamp( const T& v, const T& lo, const T& hi, Compare comp ); // C++17

template <class ForwardIterator>
    ForwardIterator
    max_element(ForwardIterator first, ForwardIterator last);  // constexpr in C++14

template <class ForwardIterator, class Compare>
    ForwardIterator
    max_element(ForwardIterator first, ForwardIterator last, Compare comp);  // constexpr in C++14

template <class T>
    const T&
    max(const T& a, const T& b); // constexpr in C++14

template <class T, class Compare>
    const T&
    max(const T& a, const T& b, Compare comp);  // constexpr in C++14

template<class T>
    T
    max(::std::initializer_list<T> t);  // constexpr in C++14

template<class T, class Compare>
    T
    max(::std::initializer_list<T> t, Compare comp);  // constexpr in C++14

template<class ForwardIterator>
    pair<ForwardIterator, ForwardIterator>
    minmax_element(ForwardIterator first, ForwardIterator last);   // constexpr in C++14

template<class ForwardIterator, class Compare>
    pair<ForwardIterator, ForwardIterator>
    minmax_element(ForwardIterator first, ForwardIterator last, Compare comp);   // constexpr in C++14

template<class T>
    pair<const T&, const T&>
    minmax(const T& a, const T& b);  // constexpr in C++14

template<class T, class Compare>
    pair<const T&, const T&>
    minmax(const T& a, const T& b, Compare comp);  // constexpr in C++14

template<class T>
    pair<T, T>
    minmax(::std::initializer_list<T> t);  // constexpr in C++14

template<class T, class Compare>
    pair<T, T>
    minmax(::std::initializer_list<T> t, Compare comp);  // constexpr in C++14

template <class InputIterator1, class InputIterator2>
    constexpr bool     // constexpr in C++20
    lexicographical_compare(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2);

template <class InputIterator1, class InputIterator2, class Compare>
    constexpr bool     // constexpr in C++20
    lexicographical_compare(InputIterator1 first1, InputIterator1 last1,
                            InputIterator2 first2, InputIterator2 last2, Compare comp);

template <class BidirectionalIterator>
    bool
    next_permutation(BidirectionalIterator first, BidirectionalIterator last);

template <class BidirectionalIterator, class Compare>
    bool
    next_permutation(BidirectionalIterator first, BidirectionalIterator last, Compare comp);

template <class BidirectionalIterator>
    bool
    prev_permutation(BidirectionalIterator first, BidirectionalIterator last);

template <class BidirectionalIterator, class Compare>
    bool
    prev_permutation(BidirectionalIterator first, BidirectionalIterator last, Compare comp);

}  // std

*/
#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cuda/std/__algorithm/adjacent_find.h>
#include <cuda/std/__algorithm/all_of.h>
#include <cuda/std/__algorithm/any_of.h>
#include <cuda/std/__algorithm/binary_search.h>
#include <cuda/std/__algorithm/clamp.h>
#include <cuda/std/__algorithm/comp.h>
#include <cuda/std/__algorithm/comp_ref_type.h>
#include <cuda/std/__algorithm/copy.h>
#include <cuda/std/__algorithm/copy_backward.h>
#include <cuda/std/__algorithm/copy_if.h>
#include <cuda/std/__algorithm/copy_n.h>
#include <cuda/std/__algorithm/count.h>
#include <cuda/std/__algorithm/count_if.h>
#include <cuda/std/__algorithm/equal.h>
#include <cuda/std/__algorithm/equal_range.h>
#include <cuda/std/__algorithm/fill.h>
#include <cuda/std/__algorithm/fill_n.h>
#include <cuda/std/__algorithm/find.h>
#include <cuda/std/__algorithm/find_end.h>
#include <cuda/std/__algorithm/find_first_of.h>
#include <cuda/std/__algorithm/find_if.h>
#include <cuda/std/__algorithm/find_if_not.h>
#include <cuda/std/__algorithm/for_each.h>
#include <cuda/std/__algorithm/for_each_n.h>
#include <cuda/std/__algorithm/generate.h>
#include <cuda/std/__algorithm/generate_n.h>
#include <cuda/std/__algorithm/half_positive.h>
#include <cuda/std/__algorithm/includes.h>
#include <cuda/std/__algorithm/is_heap.h>
#include <cuda/std/__algorithm/is_heap_until.h>
#include <cuda/std/__algorithm/is_partitioned.h>
#include <cuda/std/__algorithm/is_permutation.h>
#include <cuda/std/__algorithm/is_sorted.h>
#include <cuda/std/__algorithm/is_sorted_until.h>
#include <cuda/std/__algorithm/iter_swap.h>
#include <cuda/std/__algorithm/iterator_operations.h>
#include <cuda/std/__algorithm/lexicographical_compare.h>
#include <cuda/std/__algorithm/lower_bound.h>
#include <cuda/std/__algorithm/make_heap.h>
#include <cuda/std/__algorithm/make_projected.h>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/max_element.h>
#include <cuda/std/__algorithm/merge.h>
#include <cuda/std/__algorithm/min.h>
#include <cuda/std/__algorithm/min_element.h>
#include <cuda/std/__algorithm/minmax.h>
#include <cuda/std/__algorithm/minmax_element.h>
#include <cuda/std/__algorithm/mismatch.h>
#include <cuda/std/__algorithm/move.h>
#include <cuda/std/__algorithm/move_backward.h>
#include <cuda/std/__algorithm/next_permutation.h>
#include <cuda/std/__algorithm/none_of.h>
#include <cuda/std/__algorithm/partial_sort.h>
#include <cuda/std/__algorithm/partial_sort_copy.h>
#include <cuda/std/__algorithm/partition.h>
#include <cuda/std/__algorithm/partition_copy.h>
#include <cuda/std/__algorithm/partition_point.h>
#include <cuda/std/__algorithm/pop_heap.h>
#include <cuda/std/__algorithm/prev_permutation.h>
#include <cuda/std/__algorithm/push_heap.h>
#include <cuda/std/__algorithm/ranges_iterator_concept.h>
#include <cuda/std/__algorithm/remove.h>
#include <cuda/std/__algorithm/remove_copy.h>
#include <cuda/std/__algorithm/remove_copy_if.h>
#include <cuda/std/__algorithm/remove_if.h>
#include <cuda/std/__algorithm/replace.h>
#include <cuda/std/__algorithm/replace_copy.h>
#include <cuda/std/__algorithm/replace_copy_if.h>
#include <cuda/std/__algorithm/replace_if.h>
#include <cuda/std/__algorithm/reverse.h>
#include <cuda/std/__algorithm/reverse_copy.h>
#include <cuda/std/__algorithm/rotate.h>
#include <cuda/std/__algorithm/rotate_copy.h>
#include <cuda/std/__algorithm/search.h>
#include <cuda/std/__algorithm/search_n.h>
#include <cuda/std/__algorithm/set_difference.h>
#include <cuda/std/__algorithm/set_intersection.h>
#include <cuda/std/__algorithm/set_symmetric_difference.h>
#include <cuda/std/__algorithm/set_union.h>
#include <cuda/std/__algorithm/shift_left.h>
#include <cuda/std/__algorithm/shift_right.h>
#include <cuda/std/__algorithm/sift_down.h>
#include <cuda/std/__algorithm/sort_heap.h>
#include <cuda/std/__algorithm/swap_ranges.h>
#include <cuda/std/__algorithm/transform.h>
#include <cuda/std/__algorithm/unique.h>
#include <cuda/std/__algorithm/unique_copy.h>
#include <cuda/std/__algorithm/upper_bound.h>
#include <cuda/std/__iterator/distance.h>
#include <cuda/std/__iterator/iterator_traits.h>
#include <cuda/std/__iterator/move_iterator.h>
#include <cuda/std/__iterator/next.h>
#include <cuda/std/__iterator/prev.h>
#include <cuda/std/__iterator/reverse_iterator.h>
#include <cuda/std/__iterator/wrap_iter.h>
#include <cuda/std/__memory/destruct_n.h>
#include <cuda/std/__memory/temporary_buffer.h>
#include <cuda/std/__type_traits/common_type.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_integral.h>
#include <cuda/std/__type_traits/is_same.h>
#include <cuda/std/__type_traits/is_trivially_copy_assignable.h>
#include <cuda/std/__type_traits/make_unsigned.h>
#include <cuda/std/__type_traits/remove_const.h>
#include <cuda/std/bit>
#include <cuda/std/cstddef>
#include <cuda/std/detail/libcxx/include/__assert> // all public C++ headers provide the assertion handler
#include <cuda/std/detail/libcxx/include/__debug>
#include <cuda/std/detail/libcxx/include/cstring>
#include <cuda/std/functional>
#include <cuda/std/initializer_list>
#include <cuda/std/type_traits>
#include <cuda/std/version>

_CCCL_PUSH_MACROS

_LIBCUDACXX_BEGIN_NAMESPACE_STD

#ifndef __cuda_std__

// Adapter around a stored predicate.
//  - Called with one argument, it yields the logical negation of the predicate's result.
//  - Called with two arguments, it forwards them to the predicate in swapped order.
template <class _Predicate>
class __invert
{
private:
  _Predicate __pred_;

public:
  _LIBCUDACXX_HIDE_FROM_ABI __invert() {}

  _LIBCUDACXX_HIDE_FROM_ABI explicit __invert(_Predicate __pred)
      : __pred_(__pred)
  {}

  // Unary call: negate the wrapped predicate.
  template <class _Arg>
  _LIBCUDACXX_HIDE_FROM_ABI bool operator()(const _Arg& __arg)
  {
    return !__pred_(__arg);
  }

  // Binary call: evaluate the wrapped predicate with the operands exchanged.
  template <class _Lhs, class _Rhs>
  _LIBCUDACXX_HIDE_FROM_ABI bool operator()(const _Lhs& __lhs, const _Rhs& __rhs)
  {
    return __pred_(__rhs, __lhs);
  }
};

// random_shuffle

// __independent_bits_engine

// Compile-time scan for the highest set bit of _Value, starting at bit index _Bit and
// walking downwards. `value` is the index of the first set bit found.
template <unsigned long long _Value, size_t _Bit>
struct __log2_imp
{
  static const size_t value =
    ((_Value & (static_cast<unsigned long long>(1) << _Bit)) != 0) ? _Bit : __log2_imp<_Value, _Bit - 1>::value;
};

// End of the scan: bit index 0 has been reached, the result is 0.
template <unsigned long long _Value>
struct __log2_imp<_Value, 0>
{
  static const size_t value = 0;
};

// Zero input: no bit is set, the result is the starting bit index plus one.
template <size_t _Bit>
struct __log2_imp<0, _Bit>
{
  static const size_t value = _Bit + 1;
};

template <class _UIntType, _UIntType _Xp>
struct __log2
{
  static const size_t value = __log2_imp<_Xp, sizeof(_UIntType) * __CHAR_BIT__ - 1>::value;
};

// Wraps a reference to a random engine `_Engine` and produces `_UIntType` values assembled
// from `__w` bits of engine output. The constructor precomputes how many engine draws are
// needed and how many bits each draw contributes; __eval performs the draws.
template <class _Engine, class _UIntType>
class __independent_bits_engine
{
public:
  // types
  typedef _UIntType result_type;

private:
  typedef typename _Engine::result_type _Engine_result_type;
  // result_type when the engine's result type is not larger than it, otherwise the engine's result type.
  typedef __conditional_t<sizeof(_Engine_result_type) <= sizeof(result_type), result_type, _Engine_result_type>
    _Working_result_type;

  _Engine& __e_; // underlying engine (held by reference, not owned)
  size_t __w_; // number of bits requested per generated value
  size_t __w0_; // bits contributed by each of the first __n0_ draws (later draws contribute __w0_ + 1)
  size_t __n_; // total number of accepted engine draws per generated value
  size_t __n0_; // number of draws that contribute __w0_ bits
  _Working_result_type __y0_; // draws >= __y0_ are rejected in the first __n0_ rounds
  _Working_result_type __y1_; // draws >= __y1_ are rejected in the remaining rounds
  _Engine_result_type __mask0_; // bit mask applied to draws in the first __n0_ rounds
  _Engine_result_type __mask1_; // bit mask applied to draws in the remaining rounds

  // max - min + 1 of the engine, computed in _Working_result_type. A value of 0 is treated
  // as a special case by operator() and by the constructor.
  static constexpr _Working_result_type _Rp = _Engine::max() - _Engine::min() + _Working_result_type(1);
  // __log2 of _Rp; used by the constructor as the number of bits obtainable from one draw.
  static constexpr size_t __m               = __log2<_Working_result_type, _Rp>::value;
  static constexpr size_t _WDt              = numeric_limits<_Working_result_type>::digits;
  static constexpr size_t _EDt              = numeric_limits<_Engine_result_type>::digits;

public:
  // constructors and seeding functions
  __independent_bits_engine(_Engine& __e, size_t __w);

  // generating functions
  // Dispatches to __eval(true_type) when _Rp != 0 and to __eval(false_type) when _Rp == 0.
  result_type operator()()
  {
    return __eval(integral_constant<bool, _Rp != 0>());
  }

private:
  result_type __eval(false_type);
  result_type __eval(true_type);
};

// Binds the engine `__e` and precomputes, for a request of `__w` bits per value:
//   __n_, __n0_          - number of draws in total / number of draws using the narrower width
//   __w0_                - the narrower per-draw bit width (__w_ / __n_)
//   __y0_, __y1_         - rejection thresholds for the two kinds of draws
//   __mask0_, __mask1_   - bit masks for the two kinds of draws
template <class _Engine, class _UIntType>
__independent_bits_engine<_Engine, _UIntType>::__independent_bits_engine(_Engine& __e, size_t __w)
    : __e_(__e)
    , __w_(__w)
{
  // Initial guess: ceil(__w_ / __m) draws of __w_ / __n_ bits each.
  __n_  = __w_ / __m + (__w_ % __m != 0);
  __w0_ = __w_ / __n_;
  // __y0_ is _Rp with its low __w0_ bits cleared; the shift is skipped when it would be
  // as wide as the working type.
  if (_Rp == 0)
  {
    __y0_ = _Rp;
  }
  else if (__w0_ < _WDt)
  {
    __y0_ = (_Rp >> __w0_) << __w0_;
  }
  else
  {
    __y0_ = 0;
  }
  // If the part of the range above __y0_ exceeds __y0_ / __n_, use one more draw and
  // recompute the per-draw width and threshold.
  if (_Rp - __y0_ > __y0_ / __n_)
  {
    ++__n_;
    __w0_ = __w_ / __n_;
    if (__w0_ < _WDt)
    {
      __y0_ = (_Rp >> __w0_) << __w0_;
    }
    else
    {
      __y0_ = 0;
    }
  }
  // The first __n0_ draws use __w0_ bits; the remaining __w_ % __n_ draws use __w0_ + 1 bits.
  __n0_ = __n_ - __w_ % __n_;
  // __y1_ is _Rp with its low (__w0_ + 1) bits cleared, guarded against an over-wide shift.
  if (__w0_ < _WDt - 1)
  {
    __y1_ = (_Rp >> (__w0_ + 1)) << (__w0_ + 1);
  }
  else
  {
    __y1_ = 0;
  }
  // Masks selecting the low __w0_ and the low (__w0_ + 1) bits of an engine result; the
  // conditionals avoid shifting by the full width of the engine result type.
  __mask0_ = __w0_ > 0 ? _Engine_result_type(~0) >> (_EDt - __w0_) : _Engine_result_type(0);
  __mask1_ = __w0_ < _EDt - 1 ? _Engine_result_type(~0) >> (_EDt - (__w0_ + 1)) : _Engine_result_type(~0);
}

// Path taken by operator() when _Rp == 0: a single engine draw, masked with __mask0_,
// is converted to result_type.
template <class _Engine, class _UIntType>
inline _UIntType __independent_bits_engine<_Engine, _UIntType>::__eval(false_type)
{
  const _Engine_result_type __draw = __e_();
  return static_cast<result_type>(__draw & __mask0_);
}

// Path taken by operator() when _Rp != 0. Builds the result from __n_ engine draws:
// the first __n0_ draws contribute __w0_ bits each (threshold __y0_, mask __mask0_), the
// remaining draws contribute __w0_ + 1 bits each (threshold __y1_, mask __mask1_).
// Draws at or above the threshold are discarded and retried.
template <class _Engine, class _UIntType>
_UIntType __independent_bits_engine<_Engine, _UIntType>::__eval(true_type)
{
  const size_t __result_digits = numeric_limits<result_type>::digits;
  result_type __acc            = 0;

  // Narrow rounds.
  for (size_t __round = 0; __round < __n0_; ++__round)
  {
    _Engine_result_type __draw = __e_() - _Engine::min();
    while (__draw >= __y0_)
    {
      __draw = __e_() - _Engine::min();
    }
    // Make room for the new bits; a shift by the full width is replaced by clearing.
    if (__w0_ >= __result_digits)
    {
      __acc = 0;
    }
    else
    {
      __acc <<= __w0_;
    }
    __acc += __draw & __mask0_;
  }

  // Wide rounds (one extra bit each).
  for (size_t __round = __n0_; __round < __n_; ++__round)
  {
    _Engine_result_type __draw = __e_() - _Engine::min();
    while (__draw >= __y1_)
    {
      __draw = __e_() - _Engine::min();
    }
    if (__w0_ >= __result_digits - 1)
    {
      __acc = 0;
    }
    else
    {
      __acc <<= __w0_ + 1;
    }
    __acc += __draw & __mask1_;
  }
  return __acc;
}

// uniform_int_distribution

// Integer distribution parameterised by two bounds a and b. The generating call operator
// taking an explicit param_type is declared here and defined out of line.
template <class _IntType = int>
class uniform_int_distribution
{
public:
  // types
  using result_type = _IntType;

  // Holds the two bounds of the distribution.
  class param_type
  {
    result_type __a_;
    result_type __b_;

  public:
    using distribution_type = uniform_int_distribution;

    explicit param_type(result_type __a = 0, result_type __b = numeric_limits<result_type>::max())
        : __a_(__a)
        , __b_(__b)
    {}

    result_type a() const
    {
      return __a_;
    }
    result_type b() const
    {
      return __b_;
    }

    // Two parameter sets are equal when both bounds match.
    friend bool operator==(const param_type& __lhs, const param_type& __rhs)
    {
      return __lhs.__a_ == __rhs.__a_ && __lhs.__b_ == __rhs.__b_;
    }
    friend bool operator!=(const param_type& __lhs, const param_type& __rhs)
    {
      return !(__lhs == __rhs);
    }
  };

private:
  param_type __p_;

public:
  // constructors and reset functions
  explicit uniform_int_distribution(result_type __a = 0, result_type __b = numeric_limits<result_type>::max())
      : __p_(param_type(__a, __b))
  {}
  explicit uniform_int_distribution(const param_type& __p)
      : __p_(__p)
  {}
  // No internal state besides the parameters, so there is nothing to reset.
  void reset() {}

  // generating functions
  // Generates a value using the stored parameters.
  template <class _URNG>
  result_type operator()(_URNG& __g)
  {
    return this->operator()(__g, __p_);
  }
  // Generates a value using the supplied parameters.
  template <class _URNG>
  result_type operator()(_URNG& __g, const param_type& __p);

  // property functions
  result_type a() const
  {
    return __p_.a();
  }
  result_type b() const
  {
    return __p_.b();
  }

  param_type param() const
  {
    return __p_;
  }
  void param(const param_type& __p)
  {
    __p_ = __p;
  }

  // min() and max() report the bounds a() and b() respectively.
  result_type min() const
  {
    return a();
  }
  result_type max() const
  {
    return b();
  }

  // Distributions compare equal when their stored parameters compare equal.
  friend bool operator==(const uniform_int_distribution& __lhs, const uniform_int_distribution& __rhs)
  {
    return __lhs.__p_ == __rhs.__p_;
  }
  friend bool operator!=(const uniform_int_distribution& __lhs, const uniform_int_distribution& __rhs)
  {
    return !(__lhs == __rhs);
  }
};

// Generates a value in [__p.a(), __p.b()] from the generator __g.
// The width of the range is computed in an unsigned type (32 or 64 bits, chosen from the
// size of result_type); unsigned wrap-around is intentional here.
//  - width 1: only one possible value, return a() without consulting __g.
//  - width 0: the computation wrapped, i.e. the range spans the whole unsigned type; take
//    all of its bits from the engine.
//  - otherwise: draw just enough bits to cover the width and retry until the draw is below it.
template <class _IntType>
template <class _URNG>
typename uniform_int_distribution<_IntType>::result_type uniform_int_distribution<_IntType>::operator()(
  _URNG& __g, const param_type& __p) _LIBCUDACXX_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK
{
  using _UIntType = __conditional_t<sizeof(result_type) <= sizeof(uint32_t), uint32_t, uint64_t>;
  using _Eng      = __independent_bits_engine<_URNG, _UIntType>;

  const _UIntType __width = _UIntType(__p.b()) - _UIntType(__p.a()) + _UIntType(1);
  if (__width == 1)
  {
    return __p.a();
  }
  const size_t __digits = numeric_limits<_UIntType>::digits;
  if (__width == 0)
  {
    return static_cast<result_type>(_Eng(__g, __digits)());
  }
  // Index of the highest set bit of __width; one more bit is needed unless __width is a power of two.
  size_t __bits = __digits - __libcpp_clz(__width) - 1;
  if ((__width & (std::numeric_limits<_UIntType>::max() >> (__digits - __bits))) != 0)
  {
    ++__bits;
  }
  _Eng __bit_source(__g, __bits);
  _UIntType __draw = __bit_source();
  while (__draw >= __width)
  {
    __draw = __bit_source();
  }
  return static_cast<result_type>(__draw + __p.a());
}

// Sampling for single-pass populations. The output must be indexable.
// Phase 1 copies up to __n leading elements into the output. Phase 2 visits every further
// element, draws an index in [0, number of elements seen so far], and overwrites the output
// slot at that index when it falls inside the filled part.
// Returns the output iterator advanced by min(__n, population size).
template <class _PopulationIterator, class _SampleIterator, class _Distance, class _UniformRandomNumberGenerator>
_LIBCUDACXX_HIDE_FROM_ABI _SampleIterator __sample(
  _PopulationIterator __first,
  _PopulationIterator __last,
  _SampleIterator __output_iter,
  _Distance __n,
  _UniformRandomNumberGenerator& __g,
  input_iterator_tag)
{
  _Distance __seen = 0;
  while (__first != __last && __seen < __n)
  {
    __output_iter[__seen] = *__first;
    ++__first;
    ++__seen;
  }

  _Distance __filled = __seen;
  while (__first != __last)
  {
    _Distance __slot = _CUDA_VSTD::uniform_int_distribution<_Distance>(0, __seen)(__g);
    if (__slot < __filled)
    {
      __output_iter[__slot] = *__first;
    }
    ++__first;
    ++__seen;
  }
  return __output_iter + _CUDA_VSTD::min(__n, __seen);
}

// Sampling for multi-pass populations. The population is measured first and
// __n is clamped to its size. Each element is then visited in order and written to the
// output when a draw in [0, elements not yet visited - 1] is below the number of samples
// still needed; the walk stops once no more samples are needed.
// Returns the output iterator past the last element written.
template <class _PopulationIterator, class _SampleIterator, class _Distance, class _UniformRandomNumberGenerator>
_LIBCUDACXX_HIDE_FROM_ABI _SampleIterator __sample(
  _PopulationIterator __first,
  _PopulationIterator __last,
  _SampleIterator __output_iter,
  _Distance __n,
  _UniformRandomNumberGenerator& __g,
  forward_iterator_tag)
{
  _Distance __remaining = _CUDA_VSTD::distance(__first, __last);
  __n                   = _CUDA_VSTD::min(__n, __remaining);
  while (__n != 0)
  {
    --__remaining;
    _Distance __pick = _CUDA_VSTD::uniform_int_distribution<_Distance>(0, __remaining)(__g);
    if (__pick < __n)
    {
      *__output_iter++ = *__first;
      --__n;
    }
    ++__first;
  }
  return __output_iter;
}

// Front end for the tagged __sample overloads: checks the iterator requirements at compile
// time, asserts that __n is non-negative, converts __n to the common type of _Distance and
// the population iterator's difference_type, and dispatches on the population iterator's category.
template <class _PopulationIterator, class _SampleIterator, class _Distance, class _UniformRandomNumberGenerator>
_LIBCUDACXX_HIDE_FROM_ABI _SampleIterator __sample(
  _PopulationIterator __first,
  _PopulationIterator __last,
  _SampleIterator __output_iter,
  _Distance __n,
  _UniformRandomNumberGenerator& __g)
{
  using _PopTraits  = iterator_traits<_PopulationIterator>;
  using _PopCategory = typename _PopTraits::iterator_category;
  using _Difference  = typename _PopTraits::difference_type;
  using _CommonType  = typename common_type<_Distance, _Difference>::type;

  // A single-pass population needs a random-access output.
  static_assert(__is_cpp17_forward_iterator<_PopulationIterator>::value
                  || __is_cpp17_random_access_iterator<_SampleIterator>::value,
                "SampleIterator must meet the requirements of RandomAccessIterator");
  _LIBCUDACXX_ASSERT(__n >= 0, "N must be a positive number.");

  return _CUDA_VSTD::__sample(__first, __last, __output_iter, _CommonType(__n), __g, _PopCategory());
}

#  if _CCCL_STD_VER > 2014
// Public sampling entry point (only available when _CCCL_STD_VER > 2014).
// Accepts the generator as a forwarding reference and passes it on as an lvalue to the
// five-argument __sample overload, whose result is returned unchanged.
template <class _PopulationIterator, class _SampleIterator, class _Distance, class _UniformRandomNumberGenerator>
_LIBCUDACXX_HIDE_FROM_ABI _SampleIterator sample(
  _PopulationIterator __first,
  _PopulationIterator __last,
  _SampleIterator __output_iter,
  _Distance __n,
  _UniformRandomNumberGenerator&& __g)
{
  return _CUDA_VSTD::__sample(__first, __last, __output_iter, __n, __g);
}
#  endif // _CCCL_STD_VER > 2014

// Randomly permutes [__first, __last) using the generator __g.
// Walks the range front to back; at each position an offset in [0, elements remaining - 1]
// is drawn and, when non-zero, the current element is swapped with the one at that offset.
// Ranges with fewer than two elements are left untouched and __g is not invoked.
template <class _RandomAccessIterator, class _UniformRandomNumberGenerator>
_LIBCUDACXX_HIDE_FROM_ABI void
shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last, _UniformRandomNumberGenerator&& __g)
{
  using difference_type = typename iterator_traits<_RandomAccessIterator>::difference_type;
  using _Dp             = uniform_int_distribution<ptrdiff_t>;
  using _Pp             = typename _Dp::param_type;

  difference_type __remaining = __last - __first;
  if (!(__remaining > 1))
  {
    return;
  }

  _Dp __uid;
  --__last;
  --__remaining;
  while (__first < __last)
  {
    difference_type __offset = __uid(__g, _Pp(0, __remaining));
    if (__offset != difference_type(0))
    {
      swap(*__first, *(__first + __offset));
    }
    ++__first;
    --__remaining;
  }
}

// stable_partition

// Recursive worker for stable partitioning of a forward range.
// Preconditions (established by the caller): *__first fails __pred and __len >= 1 is the
// length of [__first, __last). __p is a (buffer pointer, capacity) pair describing optional
// scratch storage. Returns the partition point.
template <class _Predicate, class _ForwardIterator, class _Distance, class _Pair>
_CCCL_HOST_DEVICE _ForwardIterator __stable_partition(
  _ForwardIterator __first,
  _ForwardIterator __last,
  _Predicate __pred,
  _Distance __len,
  _Pair __p,
  forward_iterator_tag __fit)
{
  // *__first is known to be false
  // __len >= 1
  if (__len == 1)
  {
    return __first;
  }
  if (__len == 2)
  {
    // Only the second element is undecided: swap it to the front if it is a true.
    _ForwardIterator __m = __first;
    if (__pred(*++__m))
    {
      swap(*__first, *__m);
      return __m;
    }
    return __first;
  }
  if (__len <= __p.second)
  { // The buffer is big enough to use
    typedef typename iterator_traits<_ForwardIterator>::value_type value_type;
    // __d counts the objects constructed in the buffer; __h uses it as its deleter so the
    // constructed objects are cleaned up on every exit path of this scope.
    __destruct_n __d(0);
    unique_ptr<value_type, __destruct_n&> __h(__p.first, __d);
    // Move the falses into the temporary buffer, and the trues to the front of the line
    // Update __first to always point to the end of the trues
    value_type* __t = __p.first;
    ::new (__t) value_type(_CUDA_VSTD::move(*__first));
    __d.__incr((value_type*) 0);
    ++__t;
    _ForwardIterator __i = __first;
    while (++__i != __last)
    {
      if (__pred(*__i))
      {
        *__first = _CUDA_VSTD::move(*__i);
        ++__first;
      }
      else
      {
        ::new (__t) value_type(_CUDA_VSTD::move(*__i));
        __d.__incr((value_type*) 0);
        ++__t;
      }
    }
    // All trues now at start of range, all falses in buffer
    // Move falses back into range, but don't mess up __first which points to first false
    __i = __first;
    for (value_type* __t2 = __p.first; __t2 < __t; ++__t2, (void) ++__i)
    {
      *__i = _CUDA_VSTD::move(*__t2);
    }
    // __h destructs moved-from values out of the temp buffer, but doesn't deallocate buffer
    return __first;
  }
  // Else not enough buffer, do in place
  // __len >= 3
  _ForwardIterator __m = __first;
  _Distance __len2     = __len / 2; // __len2 >= 1
  _CUDA_VSTD::advance(__m, __len2);
  // recurse on [__first, __m), *__first known to be false
  // F?????????????????
  // f       m         l
  typedef __add_lvalue_reference_t<_Predicate> _PredRef;
  _ForwardIterator __first_false = __stable_partition<_PredRef>(__first, __m, __pred, __len2, __p, __fit);
  // TTTFFFFF??????????
  // f  ff   m         l
  // recurse on [__m1, __last), where __m1 is __m advanced past any leading trues so that
  // *__m1 is false; if every element of the second half is true there is nothing to recurse on
  _ForwardIterator __m1           = __m;
  _ForwardIterator __second_false = __last;
  _Distance __len_half            = __len - __len2;
  while (__pred(*__m1))
  {
    if (++__m1 == __last)
    {
      goto __second_half_done;
    }
    --__len_half;
  }
  // TTTFFFFFTTTF??????
  // f  ff   m  m1     l
  __second_false = __stable_partition<_PredRef>(__m1, __last, __pred, __len_half, __p, __fit);
__second_half_done:
  // TTTFFFFFTTTTTFFFFF
  // f  ff   m    sf   l
  // Rotate the falses of the first half behind the trues of the second half.
  return _CUDA_VSTD::rotate(__first_false, __m, __second_false);
  // TTTTTTTTFFFFFFFFFF
  //         |
}

// Forward-iterator entry point for stable partitioning.
// Skips the leading run of elements satisfying __pred (returning early if that is the whole
// range), then hands the remaining range - whose first element is known to fail __pred - to
// the recursive worker. A temporary buffer is requested only when the remaining length
// reaches __alloc_limit; it is returned automatically when __guard goes out of scope.
template <class _Predicate, class _ForwardIterator>
_CCCL_HOST_DEVICE _ForwardIterator
__stable_partition(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred, forward_iterator_tag)
{
  const unsigned __alloc_limit = 3; // might want to make this a function of trivial assignment

  // Advance __first to the first element failing __pred.
  for (;; ++__first)
  {
    if (__first == __last)
    {
      return __first; // every element satisfies __pred
    }
    if (!__pred(*__first))
    {
      break;
    }
  }

  // [__first, __last) is now the reduced range and *__first is known to be false.
  using difference_type = typename iterator_traits<_ForwardIterator>::difference_type;
  using value_type      = typename iterator_traits<_ForwardIterator>::value_type;
  using _PredRef        = __add_lvalue_reference_t<_Predicate>;

  difference_type __len = _CUDA_VSTD::distance(__first, __last);
  pair<value_type*, ptrdiff_t> __buf(0, 0);
  unique_ptr<value_type, __return_temporary_buffer> __guard;
  if (__len >= __alloc_limit)
  {
    __buf = _CUDA_VSTD::get_temporary_buffer<value_type>(__len);
    __guard.reset(__buf.first);
  }
  return __stable_partition<_PredRef>(__first, __last, __pred, __len, __buf, forward_iterator_tag());
}

// Recursive worker for stable partitioning of a bidirectional range.
// Works on the CLOSED range [__first, __last]. Preconditions (established by the caller):
// *__first fails __pred, *__last satisfies __pred, and __len >= 2 is the number of elements
// in the closed range. __p is a (buffer pointer, capacity) pair describing optional scratch
// storage. Returns the partition point.
template <class _Predicate, class _BidirectionalIterator, class _Distance, class _Pair>
_CCCL_HOST_DEVICE _BidirectionalIterator __stable_partition(
  _BidirectionalIterator __first,
  _BidirectionalIterator __last,
  _Predicate __pred,
  _Distance __len,
  _Pair __p,
  bidirectional_iterator_tag __bit)
{
  // *__first is known to be false
  // *__last is known to be true
  // __len >= 2
  if (__len == 2)
  {
    // Exactly one false followed by one true.
    swap(*__first, *__last);
    return __last;
  }
  if (__len == 3)
  {
    // Only the middle element is undecided; two swaps keep the relative order in either case.
    _BidirectionalIterator __m = __first;
    if (__pred(*++__m))
    {
      swap(*__first, *__m);
      swap(*__m, *__last);
      return __last;
    }
    swap(*__m, *__last);
    swap(*__first, *__m);
    return __m;
  }
  if (__len <= __p.second)
  { // The buffer is big enough to use
    typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type;
    // __d counts the objects constructed in the buffer; __h uses it as its deleter so the
    // constructed objects are cleaned up on every exit path of this scope.
    __destruct_n __d(0);
    unique_ptr<value_type, __destruct_n&> __h(__p.first, __d);
    // Move the falses into the temporary buffer, and the trues to the front of the line
    // Update __first to always point to the end of the trues
    value_type* __t = __p.first;
    ::new (__t) value_type(_CUDA_VSTD::move(*__first));
    __d.__incr((value_type*) 0);
    ++__t;
    _BidirectionalIterator __i = __first;
    while (++__i != __last)
    {
      if (__pred(*__i))
      {
        *__first = _CUDA_VSTD::move(*__i);
        ++__first;
      }
      else
      {
        ::new (__t) value_type(_CUDA_VSTD::move(*__i));
        __d.__incr((value_type*) 0);
        ++__t;
      }
    }
    // move *__last, known to be true
    *__first = _CUDA_VSTD::move(*__i);
    __i      = ++__first;
    // All trues now at start of range, all falses in buffer
    // Move falses back into range, but don't mess up __first which points to first false
    for (value_type* __t2 = __p.first; __t2 < __t; ++__t2, (void) ++__i)
    {
      *__i = _CUDA_VSTD::move(*__t2);
    }
    // __h destructs moved-from values out of the temp buffer, but doesn't deallocate buffer
    return __first;
  }
  // Else not enough buffer, do in place
  // __len >= 4
  _BidirectionalIterator __m = __first;
  _Distance __len2           = __len / 2; // __len2 >= 2
  _CUDA_VSTD::advance(__m, __len2);
  // recurse on [__first, __m-1], except reduce __m-1 until *(__m-1) is true, *__first known to be false
  // F????????????????T
  // f       m        l
  _BidirectionalIterator __m1          = __m;
  _BidirectionalIterator __first_false = __first;
  _Distance __len_half                 = __len2;
  while (!__pred(*--__m1))
  {
    if (__m1 == __first)
    {
      // The whole first half is false: nothing to partition there.
      goto __first_half_done;
    }
    --__len_half;
  }
  // F???TFFF?????????T
  // f   m1  m        l
  typedef __add_lvalue_reference_t<_Predicate> _PredRef;
  __first_false = __stable_partition<_PredRef>(__first, __m1, __pred, __len_half, __p, __bit);
__first_half_done:
  // TTTFFFFF?????????T
  // f  ff   m        l
  // recurse on [__m, __last], except increase __m until *(__m) is false, *__last known to be true
  __m1                                  = __m;
  _BidirectionalIterator __second_false = __last;
  ++__second_false;
  __len_half = __len - __len2;
  while (__pred(*__m1))
  {
    if (++__m1 == __last)
    {
      // The whole second half is true: __second_false already points one past __last.
      goto __second_half_done;
    }
    --__len_half;
  }
  // TTTFFFFFTTTF?????T
  // f  ff   m  m1    l
  __second_false = __stable_partition<_PredRef>(__m1, __last, __pred, __len_half, __p, __bit);
__second_half_done:
  // TTTFFFFFTTTTTFFFFF
  // f  ff   m    sf  l
  // Rotate the falses of the first half behind the trues of the second half.
  return _CUDA_VSTD::rotate(__first_false, __m, __second_false);
  // TTTTTTTTFFFFFFFFFF
  //         |
}

// Bidirectional-iterator entry point for stable partitioning.
// Trims leading elements that satisfy __pred and trailing elements that fail it, returning
// early when nothing is left in between. The surviving closed range [__first, __last]
// starts with a false and ends with a true, and is handed to the recursive worker. A
// temporary buffer is requested only when the range length reaches __alloc_limit; it is
// returned automatically when __guard goes out of scope.
template <class _Predicate, class _BidirectionalIterator>
_CCCL_HOST_DEVICE _BidirectionalIterator __stable_partition(
  _BidirectionalIterator __first, _BidirectionalIterator __last, _Predicate __pred, bidirectional_iterator_tag)
{
  using difference_type = typename iterator_traits<_BidirectionalIterator>::difference_type;
  using value_type      = typename iterator_traits<_BidirectionalIterator>::value_type;
  using _PredRef        = __add_lvalue_reference_t<_Predicate>;
  const difference_type __alloc_limit = 4; // might want to make this a function of trivial assignment

  // Advance __first to the first element failing __pred.
  for (;; ++__first)
  {
    if (__first == __last)
    {
      return __first; // every element satisfies __pred
    }
    if (!__pred(*__first))
    {
      break;
    }
  }

  // Everything before __first is already in place. Pull __last back to the last element
  // satisfying __pred; if it meets __first, the rest of the range is all false.
  for (;;)
  {
    --__last;
    if (__first == __last)
    {
      return __first;
    }
    if (__pred(*__last))
    {
      break;
    }
  }

  // Closed range [__first, __last]: *__first is false, *__last is true, length >= 2.
  difference_type __len = _CUDA_VSTD::distance(__first, __last) + 1;
  pair<value_type*, ptrdiff_t> __buf(0, 0);
  unique_ptr<value_type, __return_temporary_buffer> __guard;
  if (__len >= __alloc_limit)
  {
    __buf = _CUDA_VSTD::get_temporary_buffer<value_type>(__len);
    __guard.reset(__buf.first);
  }
  return __stable_partition<_PredRef>(__first, __last, __pred, __len, __buf, bidirectional_iterator_tag());
}

// Public entry point: selects the __stable_partition overload matching the iterator
// category of _ForwardIterator and returns its result. The predicate is passed to the
// implementation as an lvalue-reference type.
template <class _ForwardIterator, class _Predicate>
_LIBCUDACXX_HIDE_FROM_ABI _ForwardIterator
stable_partition(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred)
{
  return __stable_partition<__add_lvalue_reference_t<_Predicate>>(
    __first, __last, __pred, typename iterator_traits<_ForwardIterator>::iterator_category());
}

// sort

// stable, 2-3 compares, 0-2 swaps

// Sorts the three elements referenced by __x, __y, __z in place with respect to __c and
// returns the number of swaps performed (0, 1 or 2).
template <class _Compare, class _ForwardIterator>
_CCCL_HOST_DEVICE unsigned __sort3(_ForwardIterator __x, _ForwardIterator __y, _ForwardIterator __z, _Compare __c)
{
  if (__c(*__y, *__x))
  {
    // x > y
    if (__c(*__z, *__y))
    {
      // x > y > z: exchanging the ends sorts all three at once
      swap(*__x, *__z);
      return 1;
    }
    // x > y and y <= z: put y first, then check the new middle against z
    swap(*__x, *__y);
    if (__c(*__z, *__y))
    {
      swap(*__y, *__z);
      return 2;
    }
    return 1;
  }

  // x <= y
  if (!__c(*__z, *__y))
  {
    return 0; // x <= y <= z, already sorted
  }
  // x <= y and y > z: move the largest to the back, then check the new middle against x
  swap(*__y, *__z);
  if (__c(*__y, *__x))
  {
    swap(*__x, *__y);
    return 2;
  }
  return 1;
}

// stable, 3-6 compares, 0-5 swaps

// Sorts four elements in place: the first three via __sort3, then the fourth is moved
// towards the front one position at a time until it is in order.
// Returns the total number of swaps performed.
template <class _Compare, class _ForwardIterator>
_CCCL_HOST_DEVICE unsigned
__sort4(_ForwardIterator __x1, _ForwardIterator __x2, _ForwardIterator __x3, _ForwardIterator __x4, _Compare __c)
{
  unsigned __swaps = __sort3<_Compare>(__x1, __x2, __x3, __c);
  if (!__c(*__x4, *__x3))
  {
    return __swaps;
  }
  swap(*__x3, *__x4);
  ++__swaps;
  if (!__c(*__x3, *__x2))
  {
    return __swaps;
  }
  swap(*__x2, *__x3);
  ++__swaps;
  if (__c(*__x2, *__x1))
  {
    swap(*__x1, *__x2);
    ++__swaps;
  }
  return __swaps;
}

// stable, 4-10 compares, 0-9 swaps

// Sorts five elements in place: the first four via __sort4, then the fifth is moved
// towards the front one position at a time until it is in order.
// Returns the total number of swaps performed.
template <class _Compare, class _ForwardIterator>
_CCCL_VISIBILITY_HIDDEN _CCCL_HOST_DEVICE unsigned __sort5(
  _ForwardIterator __x1,
  _ForwardIterator __x2,
  _ForwardIterator __x3,
  _ForwardIterator __x4,
  _ForwardIterator __x5,
  _Compare __c)
{
  unsigned __swaps = __sort4<_Compare>(__x1, __x2, __x3, __x4, __c);
  if (!__c(*__x5, *__x4))
  {
    return __swaps;
  }
  swap(*__x4, *__x5);
  ++__swaps;
  if (!__c(*__x4, *__x3))
  {
    return __swaps;
  }
  swap(*__x3, *__x4);
  ++__swaps;
  if (!__c(*__x3, *__x2))
  {
    return __swaps;
  }
  swap(*__x2, *__x3);
  ++__swaps;
  if (__c(*__x2, *__x1))
  {
    swap(*__x1, *__x2);
    ++__swaps;
  }
  return __swaps;
}

// Assumes size > 0
// Selection sort: for every position except the last, find the minimum of the remaining
// range [__first, __last) and swap it into place if it is not already there.
template <class _Compare, class _BidirectionalIterator>
_CCCL_HOST_DEVICE void __selection_sort(_BidirectionalIterator __first, _BidirectionalIterator __last, _Compare __comp)
{
  using _CompRef = __add_lvalue_reference_t<_Compare>;

  // The final element is in place once all others are, so stop one short of __last.
  _BidirectionalIterator __stop = __last;
  --__stop;
  while (__first != __stop)
  {
    _BidirectionalIterator __smallest =
      _CUDA_VSTD::min_element<_BidirectionalIterator, _CompRef>(__first, __last, __comp);
    if (__smallest != __first)
    {
      swap(*__first, *__smallest);
    }
    ++__first;
  }
}

// Insertion sort on [__first, __last). Each element is lifted out, larger predecessors are
// shifted one slot to the right, and the element is dropped into the gap. An empty range is
// a no-op.
template <class _Compare, class _BidirectionalIterator>
_CCCL_HOST_DEVICE void __insertion_sort(_BidirectionalIterator __first, _BidirectionalIterator __last, _Compare __comp)
{
  using value_type = typename iterator_traits<_BidirectionalIterator>::value_type;
  if (__first == __last)
  {
    return;
  }

  _BidirectionalIterator __cur = __first;
  for (++__cur; __cur != __last; ++__cur)
  {
    // __hole tracks the vacated slot, __prev walks the already-sorted prefix backwards.
    _BidirectionalIterator __hole = __cur;
    value_type __tmp(_CUDA_VSTD::move(*__hole));
    _BidirectionalIterator __prev = __cur;
    while (__prev != __first && __comp(__tmp, *--__prev))
    {
      *__hole = _CUDA_VSTD::move(*__prev);
      --__hole;
    }
    *__hole = _CUDA_VSTD::move(__tmp);
  }
}

// Insertion sort for ranges of at least three elements: the first three are ordered with
// __sort3, then each later element that is out of order relative to its predecessor is
// shifted left into position.
template <class _Compare, class _RandomAccessIterator>
_CCCL_HOST_DEVICE void __insertion_sort_3(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
{
  using value_type = typename iterator_traits<_RandomAccessIterator>::value_type;

  // __hole starts as the last element of the sorted prefix.
  _RandomAccessIterator __hole = __first + 2;
  __sort3<_Compare>(__first, __first + 1, __hole, __comp);

  for (_RandomAccessIterator __cur = __hole + 1; __cur != __last; ++__cur)
  {
    if (__comp(*__cur, *__hole))
    {
      value_type __tmp(_CUDA_VSTD::move(*__cur));
      _RandomAccessIterator __probe = __hole;
      __hole                        = __cur;
      // Shift predecessors right until the front is reached or one is not greater than __tmp.
      do
      {
        *__hole = _CUDA_VSTD::move(*__probe);
        __hole  = __probe;
      } while (__hole != __first && __comp(__tmp, *--__probe));
      *__hole = _CUDA_VSTD::move(__tmp);
    }
    __hole = __cur;
  }
}

// Bounded insertion sort. Ranges of up to five elements are sorted directly. Longer ranges
// are insertion-sorted, but the work is abandoned after 8 elements had to be moved; in that
// case the return value tells whether the element just placed happened to be the last one.
// Returns true when [__first, __last) is fully sorted on return, false otherwise.
template <class _Compare, class _RandomAccessIterator>
_CCCL_HOST_DEVICE bool
__insertion_sort_incomplete(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
{
  switch (__last - __first)
  {
    case 0:
    case 1:
      return true;
    case 2:
      --__last;
      if (__comp(*__last, *__first))
      {
        swap(*__first, *__last);
      }
      return true;
    case 3:
      --__last;
      _CUDA_VSTD::__sort3<_Compare>(__first, __first + 1, __last, __comp);
      return true;
    case 4:
      --__last;
      _CUDA_VSTD::__sort4<_Compare>(__first, __first + 1, __first + 2, __last, __comp);
      return true;
    case 5:
      --__last;
      _CUDA_VSTD::__sort5<_Compare>(__first, __first + 1, __first + 2, __first + 3, __last, __comp);
      return true;
  }

  using value_type = typename iterator_traits<_RandomAccessIterator>::value_type;
  const unsigned __max_insertions = 8;
  unsigned __insertions           = 0;

  // __hole starts as the last element of the sorted prefix.
  _RandomAccessIterator __hole = __first + 2;
  __sort3<_Compare>(__first, __first + 1, __hole, __comp);

  for (_RandomAccessIterator __cur = __hole + 1; __cur != __last; ++__cur)
  {
    if (__comp(*__cur, *__hole))
    {
      value_type __tmp(_CUDA_VSTD::move(*__cur));
      _RandomAccessIterator __probe = __hole;
      __hole                        = __cur;
      do
      {
        *__hole = _CUDA_VSTD::move(*__probe);
        __hole  = __probe;
      } while (__hole != __first && __comp(__tmp, *--__probe));
      *__hole = _CUDA_VSTD::move(__tmp);

      ++__insertions;
      if (__insertions == __max_insertions)
      {
        // Budget exhausted: report success only if no elements remain.
        return ++__cur == __last;
      }
    }
    __hole = __cur;
  }
  return true;
}

// Insertion-sorts [__first1, __last1) INTO the storage starting at __first2.
// Every destination slot is created with placement new (move-constructed from the source
// or from a neighbouring destination element), so the destination is treated as raw
// storage with room for at least distance(__first1, __last1) objects.
// The source elements are left in a moved-from state. An empty source range is a no-op.
template <class _Compare, class _BirdirectionalIterator>
_CCCL_HOST_DEVICE void __insertion_sort_move(
  _BirdirectionalIterator __first1,
  _BirdirectionalIterator __last1,
  typename iterator_traits<_BirdirectionalIterator>::value_type* __first2,
  _Compare __comp)
{
  typedef typename iterator_traits<_BirdirectionalIterator>::value_type value_type;
  if (__first1 != __last1)
  {
    // Guard: __d is bumped after every construction and __h uses it as its deleter, so an
    // early exit from this scope cleans up what was built so far (presumably by destroying
    // exactly the counted objects - see __destruct_n).
    __destruct_n __d(0);
    unique_ptr<value_type, __destruct_n&> __h(__first2, __d);
    // [__first2, __last2) is the sorted, constructed part of the destination.
    value_type* __last2 = __first2;
    ::new (__last2) value_type(_CUDA_VSTD::move(*__first1));
    __d.__incr((value_type*) 0);
    for (++__last2; ++__first1 != __last1; ++__last2)
    {
      value_type* __j2 = __last2; // the new (not yet constructed) slot
      value_type* __i2 = __j2;
      if (__comp(*__first1, *--__i2))
      {
        // The incoming element belongs before the current back: open the new slot by
        // move-constructing the back element into it, then shift further elements right by
        // assignment until the insertion point is found.
        ::new (__j2) value_type(_CUDA_VSTD::move(*__i2));
        __d.__incr((value_type*) 0);
        for (--__j2; __i2 != __first2 && __comp(*__first1, *--__i2); --__j2)
        {
          *__j2 = _CUDA_VSTD::move(*__i2);
        }
        *__j2 = _CUDA_VSTD::move(*__first1);
      }
      else
      {
        // Already in order: construct the incoming element directly at the back.
        ::new (__j2) value_type(_CUDA_VSTD::move(*__first1));
        __d.__incr((value_type*) 0);
      }
    }
    // Success: disarm the guard so the constructed objects stay alive for the caller.
    __h.release();
  }
}

// Sorts [__first, __last) in place according to __comp.
//
// Ranges of up to five elements are handled by dedicated cases, ranges of at most __limit elements by
// __insertion_sort_3, and anything longer is partitioned around a pivot picked with __sort3 (or __sort5
// once the length reaches 1000). After partitioning, the smaller side is sorted by a recursive call and
// the larger side by another trip around the loop. If the partition step performed no swaps, the two
// sides are first offered to __insertion_sort_incomplete, whose bool result decides whether a side still
// needs further sorting.
template <class _Compare, class _RandomAccessIterator>
_CCCL_HOST_DEVICE void __sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
{
  // _Compare is known to be a reference type
  typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
  typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
  // Largest length handed to insertion sort: 30 for trivially copyable value types, otherwise 6.
  const difference_type __limit =
    is_trivially_copy_constructible<value_type>::value && is_trivially_copy_assignable<value_type>::value ? 30 : 6;
  while (true)
  {
  __restart:
    difference_type __len = __last - __first;
    switch (__len)
    {
      case 0:
      case 1:
        return;
      case 2:
        if (__comp(*--__last, *__first))
        {
          swap(*__first, *__last);
        }
        return;
      case 3:
        _CUDA_VSTD::__sort3<_Compare>(__first, __first + 1, --__last, __comp);
        return;
      case 4:
        _CUDA_VSTD::__sort4<_Compare>(__first, __first + 1, __first + 2, --__last, __comp);
        return;
      case 5:
        _CUDA_VSTD::__sort5<_Compare>(__first, __first + 1, __first + 2, __first + 3, --__last, __comp);
        return;
    }
    if (__len <= __limit)
    {
      _CUDA_VSTD::__insertion_sort_3<_Compare>(__first, __last, __comp);
      return;
    }
    // __len > __limit >= 6
    _RandomAccessIterator __m   = __first;
    _RandomAccessIterator __lm1 = __last;
    --__lm1;
    // Number of swaps performed while choosing the pivot and partitioning; zero means the range was
    // already partitioned.
    unsigned __n_swaps;
    {
      difference_type __delta;
      if (__len >= 1000)
      {
        // Long range: order five samples (first, quarter, middle, three-quarter, last).
        __delta = __len / 2;
        __m += __delta;
        __delta /= 2;
        __n_swaps = _CUDA_VSTD::__sort5<_Compare>(__first, __first + __delta, __m, __m + __delta, __lm1, __comp);
      }
      else
      {
        // Shorter range: order three samples (first, middle, last).
        __delta = __len / 2;
        __m += __delta;
        __n_swaps = _CUDA_VSTD::__sort3<_Compare>(__first, __m, __lm1, __comp);
      }
    }
    // *__m is median
    // partition [__first, __m) < *__m and *__m <= [__m, __last)
    // (this inhibits tossing elements equivalent to __m around unnecessarily)
    _RandomAccessIterator __i = __first;
    _RandomAccessIterator __j = __lm1;
    // j points beyond range to be tested, *__m is known to be <= *__lm1
    // The search going up is known to be guarded but the search coming down isn't.
    // Prime the downward search with a guard.
    if (!__comp(*__i, *__m)) // if *__first == *__m
    {
      // *__first == *__m, *__first doesn't go in first part
      // manually guard downward moving __j against __i
      while (true)
      {
        if (__i == --__j)
        {
          // *__first == *__m, *__m <= all other elements
          // Partition instead into [__first, __i) == *__first and *__first < [__i, __last)
          ++__i; // __first + 1
          __j = __last;
          if (!__comp(*__first, *--__j)) // we need a guard if *__first == *(__last-1)
          {
            while (true)
            {
              if (__i == __j)
              {
                return; // [__first, __last) all equivalent elements
              }
              if (__comp(*__first, *__i))
              {
                swap(*__i, *__j);
                ++__n_swaps;
                ++__i;
                break;
              }
              ++__i;
            }
          }
          // [__first, __i) == *__first and *__first < [__j, __last) and __j == __last - 1
          if (__i == __j)
          {
            return;
          }
          while (true)
          {
            while (!__comp(*__first, *__i))
            {
              ++__i;
            }
            while (__comp(*__first, *--__j))
              ;
            if (__i >= __j)
            {
              break;
            }
            swap(*__i, *__j);
            ++__n_swaps;
            ++__i;
          }
          // [__first, __i) == *__first and *__first < [__i, __last)
          // The first part is sorted, sort the second part
          // _CUDA_VSTD::__sort<_Compare>(__i, __last, __comp);
          __first = __i;
          goto __restart;
        }
        if (__comp(*__j, *__m))
        {
          swap(*__i, *__j);
          ++__n_swaps;
          break; // found guard for downward moving __j, now use unguarded partition
        }
      }
    }
    // It is known that *__i < *__m
    ++__i;
    // j points beyond range to be tested, *__m is known to be <= *__lm1
    // if not yet partitioned...
    if (__i < __j)
    {
      // known that *(__i - 1) < *__m
      // known that __i <= __m
      while (true)
      {
        // __m still guards upward moving __i
        while (__comp(*__i, *__m))
        {
          ++__i;
        }
        // It is now known that a guard exists for downward moving __j
        while (!__comp(*--__j, *__m))
          ;
        if (__i > __j)
        {
          break;
        }
        swap(*__i, *__j);
        ++__n_swaps;
        // It is known that __m != __j
        // If __m just moved, follow it
        if (__m == __i)
        {
          __m = __j;
        }
        ++__i;
      }
    }
    // [__first, __i) < *__m and *__m <= [__i, __last)
    if (__i != __m && __comp(*__m, *__i))
    {
      swap(*__i, *__m);
      ++__n_swaps;
    }
    // [__first, __i) < *__i and *__i <= [__i+1, __last)
    // If we were given a perfect partition, see if insertion sort is quick...
    if (__n_swaps == 0)
    {
      // __fs: result of the bounded insertion sort on the left part [__first, __i).
      bool __fs = _CUDA_VSTD::__insertion_sort_incomplete<_Compare>(__first, __i, __comp);
      if (_CUDA_VSTD::__insertion_sort_incomplete<_Compare>(__i + 1, __last, __comp))
      {
        // Right part is done; if the left part is done too we are finished, otherwise keep
        // working on the left part only.
        if (__fs)
        {
          return;
        }
        __last = __i;
        continue;
      }
      else
      {
        // Right part is not done; if the left part is, keep working on the right part only.
        if (__fs)
        {
          __first = ++__i;
          continue;
        }
      }
    }
    // sort smaller range with recursive call and larger with tail recursion elimination
    if (__i - __first < __last - __i)
    {
      _CUDA_VSTD::__sort<_Compare>(__first, __i, __comp);
      // _CUDA_VSTD::__sort<_Compare>(__i+1, __last, __comp);
      __first = ++__i;
    }
    else
    {
      _CUDA_VSTD::__sort<_Compare>(__i + 1, __last, __comp);
      // _CUDA_VSTD::__sort<_Compare>(__first, __i, __comp);
      __last = __i;
    }
  }
}

// Public sort taking a comparator. The comparator is wrapped in __comp_ref_type<_Compare> once here, so
// that the outermost __sort call and all of its recursive calls use the same instantiation and receive
// the comparator through that wrapper type.
template <class _RandomAccessIterator, class _Compare>
_LIBCUDACXX_HIDE_FROM_ABI void sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
{
  typedef __comp_ref_type<_Compare> _Comp_ref;
  _CUDA_VSTD::__sort<_Comp_ref>(__first, __last, _Comp_ref(__comp));
}

// Public sort without a comparator: delegates to the comparator overload with a default-constructed
// __less.
template <class _RandomAccessIterator>
_LIBCUDACXX_HIDE_FROM_ABI void sort(_RandomAccessIterator __first, _RandomAccessIterator __last)
{
  _CUDA_VSTD::sort(__first, __last, __less());
}

// Overload for arrays of pointers: the pointers are reinterpreted as size_t values and sorted with
// __less, so every pointer element type goes through the size_t sort.
// NOTE(review): this relies on size_t having the same size and ordering as _Tp* - confirm for all
// supported targets.
template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI void sort(_Tp** __first, _Tp** __last)
{
  size_t* const __begin = reinterpret_cast<size_t*>(__first);
  size_t* const __end   = reinterpret_cast<size_t*>(__last);
  _CUDA_VSTD::sort(__begin, __end, __less());
}

// Overload for __wrap_iter over a raw pointer: unwraps both iterators with base() and sorts the
// underlying pointer range.
template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI void sort(__wrap_iter<_Tp*> __first, __wrap_iter<_Tp*> __last)
{
  _Tp* const __begin = __first.base();
  _Tp* const __end   = __last.base();
  _CUDA_VSTD::sort(__begin, __end);
}

// Overload for __wrap_iter over a raw pointer with a comparator: unwraps the iterators with base() and
// forwards to sort<_Tp*, _Compare&>, i.e. the comparator is passed on by lvalue reference.
template <class _Tp, class _Compare>
_LIBCUDACXX_HIDE_FROM_ABI void sort(__wrap_iter<_Tp*> __first, __wrap_iter<_Tp*> __last, _Compare __comp)
{
  using _Comp_ref = __add_lvalue_reference_t<_Compare>;
  _CUDA_VSTD::sort<_Tp*, _Comp_ref>(__first.base(), __last.base(), __comp);
}

// Declarations of the __sort<__less&, T*> specializations for the built-in character, integer and
// floating-point types. The list of element types matches the __insertion_sort_incomplete list that
// follows (char, wchar_t, signed char, unsigned char, short, ..., long double).
// NOTE(review): the expansion of _LIBCUDACXX_EXTERN_TEMPLATE is not visible here - presumably an
// explicit instantiation declaration; confirm.
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, char*>(char*, char*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, wchar_t*>(wchar_t*, wchar_t*, __less&))
// signed char (not plain `signed`, which is int and is already covered by the int* entry below).
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, signed char*>(signed char*, signed char*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, unsigned char*>(unsigned char*, unsigned char*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, short*>(short*, short*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, unsigned short*>(unsigned short*, unsigned short*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, int*>(int*, int*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, unsigned*>(unsigned*, unsigned*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, long*>(long*, long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, unsigned long*>(unsigned long*, unsigned long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, long long*>(long long*, long long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, unsigned long long*>(
  unsigned long long*, unsigned long long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, float*>(float*, float*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, double*>(double*, double*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI void __sort<__less&, long double*>(long double*, long double*, __less&))

// Declarations of the __insertion_sort_incomplete<__less&, T*> specializations for the built-in
// character, integer and floating-point types, followed by the single __sort5 declaration for
// long double.
// NOTE(review): the expansion of _LIBCUDACXX_EXTERN_TEMPLATE is not visible here - presumably an
// explicit instantiation declaration; confirm.
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, char*>(char*, char*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, wchar_t*>(wchar_t*, wchar_t*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, signed char*>(
  signed char*, signed char*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, unsigned char*>(
  unsigned char*, unsigned char*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, short*>(short*, short*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, unsigned short*>(
  unsigned short*, unsigned short*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, int*>(int*, int*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, unsigned*>(unsigned*, unsigned*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, long*>(long*, long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, unsigned long*>(
  unsigned long*, unsigned long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, long long*>(long long*, long long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, unsigned long long*>(
  unsigned long long*, unsigned long long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, float*>(float*, float*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(
  _LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, double*>(double*, double*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI bool __insertion_sort_incomplete<__less&, long double*>(
  long double*, long double*, __less&))

// __sort5 is declared for long double only.
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_HIDE_FROM_ABI unsigned __sort5<__less&, long double*>(
  long double*, long double*, long double*, long double*, long double*, __less&))

// inplace_merge

// Merges [__first1, __last1) with [__first2, __last2) into __result by move assignment.
//
// On a tie the element of the first range is taken (an element of the second range is chosen only
// when __comp(*__first2, *__first1) holds). The function stops as soon as the first range is used up:
// whatever is left of the second range is not touched, so the caller must arrange for those elements
// to already be at their final position.
template <class _Compare, class _InputIterator1, class _InputIterator2, class _OutputIterator>
_CCCL_HOST_DEVICE void __half_inplace_merge(
  _InputIterator1 __first1,
  _InputIterator1 __last1,
  _InputIterator2 __first2,
  _InputIterator2 __last2,
  _OutputIterator __result,
  _Compare __comp)
{
  while (__first1 != __last1)
  {
    if (__first2 == __last2)
    {
      // Second range exhausted: move the remainder of the first range in one go.
      _CUDA_VSTD::move(__first1, __last1, __result);
      return;
    }

    if (__comp(*__first2, *__first1))
    {
      *__result = _CUDA_VSTD::move(*__first2);
      ++__first2;
    }
    else
    {
      *__result = _CUDA_VSTD::move(*__first1);
      ++__first1;
    }
    ++__result;
  }
  // The rest of [__first2, __last2) is left where it is.
}

// Merges the adjacent ranges [__first, __middle) and [__middle, __last) using the scratch storage __buff.
//
// The shorter of the two halves (the left one on equal lengths) is move-constructed into __buff and
// then merged back with __half_inplace_merge:
//   * left half buffered  -> forward merge into __first;
//   * right half buffered -> the same merge run backwards through reverse iterators with the
//     comparator wrapped in __invert, writing towards __last.
// __d.__incr is called once per object constructed in __buff, and __d is the deleter of the guard
// __h2, which is not released here (presumably so the buffered objects are destroyed on exit -
// __destruct_n is defined elsewhere).
template <class _Compare, class _BidirectionalIterator>
_CCCL_HOST_DEVICE void __buffered_inplace_merge(
  _BidirectionalIterator __first,
  _BidirectionalIterator __middle,
  _BidirectionalIterator __last,
  _Compare __comp,
  typename iterator_traits<_BidirectionalIterator>::difference_type __len1,
  typename iterator_traits<_BidirectionalIterator>::difference_type __len2,
  typename iterator_traits<_BidirectionalIterator>::value_type* __buff)
{
  using value_type = typename iterator_traits<_BidirectionalIterator>::value_type;
  __destruct_n __d(0);
  unique_ptr<value_type, __destruct_n&> __h2(__buff, __d);

  // One past the last object constructed in the buffer.
  value_type* __buff_end = __buff;

  if (__len1 <= __len2)
  {
    // Buffer the left half, then merge forwards.
    _BidirectionalIterator __src = __first;
    while (__src != __middle)
    {
      ::new (__buff_end) value_type(_CUDA_VSTD::move(*__src));
      __d.__incr((value_type*) 0);
      ++__src;
      ++__buff_end;
    }
    __half_inplace_merge(__buff, __buff_end, __middle, __last, __first, __comp);
    return;
  }

  // Buffer the right half, then merge backwards.
  _BidirectionalIterator __src = __middle;
  while (__src != __last)
  {
    ::new (__buff_end) value_type(_CUDA_VSTD::move(*__src));
    __d.__incr((value_type*) 0);
    ++__src;
    ++__buff_end;
  }
  using _RBi = reverse_iterator<_BidirectionalIterator>;
  using _Rv  = reverse_iterator<value_type*>;
  __half_inplace_merge(
    _Rv(__buff_end), _Rv(__buff), _RBi(__middle), _RBi(__first), _RBi(__last), __invert<_Compare>(__comp));
}

// Merges the adjacent ranges [__first, __middle) and [__middle, __last) in place.
//
// __len1 and __len2 are the lengths of the two ranges; __buff / __buff_size describe scratch storage.
// As soon as either range fits into the buffer the work is handed to __buffered_inplace_merge.
// Otherwise the ranges are split around a "median" found by binary search, the two middle pieces are
// rotated into place, the smaller resulting pair is merged by a recursive call and the larger pair
// by another iteration of the loop.
template <class _Compare, class _BidirectionalIterator>
_CCCL_HOST_DEVICE void __inplace_merge(
  _BidirectionalIterator __first,
  _BidirectionalIterator __middle,
  _BidirectionalIterator __last,
  _Compare __comp,
  typename iterator_traits<_BidirectionalIterator>::difference_type __len1,
  typename iterator_traits<_BidirectionalIterator>::difference_type __len2,
  typename iterator_traits<_BidirectionalIterator>::value_type* __buff,
  ptrdiff_t __buff_size)
{
  typedef typename iterator_traits<_BidirectionalIterator>::difference_type difference_type;
  while (true)
  {
    // if __middle == __last, we're done
    if (__len2 == 0)
    {
      return;
    }
    // One of the halves fits into the scratch buffer: finish with the buffered merge.
    if (__len1 <= __buff_size || __len2 <= __buff_size)
    {
      return __buffered_inplace_merge<_Compare>(__first, __middle, __last, __comp, __len1, __len2, __buff);
    }
    // shrink [__first, __middle) as much as possible (with no moves), returning if it shrinks to 0
    for (; true; ++__first, (void) --__len1)
    {
      if (__len1 == 0)
      {
        return;
      }
      if (__comp(*__middle, *__first))
      {
        break;
      }
    }
    // __first < __middle < __last
    // *__first > *__middle
    // partition [__first, __m1) [__m1, __middle) [__middle, __m2) [__m2, __last) such that
    //     all elements in:
    //         [__first, __m1)  <= [__middle, __m2)
    //         [__middle, __m2) <  [__m1, __middle)
    //         [__m1, __middle) <= [__m2, __last)
    //     and __m1 or __m2 is in the middle of its range
    _BidirectionalIterator __m1; // "median" of [__first, __middle)
    _BidirectionalIterator __m2; // "median" of [__middle, __last)
    difference_type __len11; // distance(__first, __m1)
    difference_type __len21; // distance(__middle, __m2)
    // binary search smaller range
    if (__len1 < __len2)
    { // __len1 >= 1, __len2 >= 2
      __len21 = __len2 / 2;
      __m2    = __middle;
      _CUDA_VSTD::advance(__m2, __len21);
      __m1    = __upper_bound<_Compare>(__first, __middle, *__m2, __comp);
      __len11 = _CUDA_VSTD::distance(__first, __m1);
    }
    else
    {
      if (__len1 == 1)
      { // __len1 >= __len2 && __len2 > 0, therefore __len2 == 1
        // It is known *__first > *__middle
        swap(*__first, *__middle);
        return;
      }
      // __len1 >= 2, __len2 >= 1
      __len11 = __len1 / 2;
      __m1    = __first;
      _CUDA_VSTD::advance(__m1, __len11);
      __m2    = __lower_bound<_Compare>(__middle, __last, *__m1, __comp);
      __len21 = _CUDA_VSTD::distance(__middle, __m2);
    }
    difference_type __len12 = __len1 - __len11; // distance(__m1, __middle)
    difference_type __len22 = __len2 - __len21; // distance(__m2, __last)
    // [__first, __m1) [__m1, __middle) [__middle, __m2) [__m2, __last)
    // swap middle two partitions
    __middle = _CUDA_VSTD::rotate(__m1, __middle, __m2);
    // __len12 and __len21 now have swapped meanings
    // merge smaller range with recursive call and larger with tail recursion elimination
    if (__len11 + __len21 < __len12 + __len22)
    {
      __inplace_merge<_Compare>(__first, __m1, __middle, __comp, __len11, __len21, __buff, __buff_size);
      //          __inplace_merge<_Compare>(__middle, __m2, __last, __comp, __len12, __len22, __buff, __buff_size);
      __first  = __middle;
      __middle = __m2;
      __len1   = __len12;
      __len2   = __len22;
    }
    else
    {
      __inplace_merge<_Compare>(__middle, __m2, __last, __comp, __len12, __len22, __buff, __buff_size);
      //          __inplace_merge<_Compare>(__first, __m1, __middle, __comp, __len11, __len21, __buff, __buff_size);
      __last   = __middle;
      __middle = __m1;
      __len1   = __len11;
      __len2   = __len21;
    }
  }
}

// Public inplace_merge taking a comparator.
//
// Measures both halves, requests a temporary buffer sized for the shorter half through
// get_temporary_buffer and hands everything to __inplace_merge. The buffer pointer is owned by a
// unique_ptr with the __return_temporary_buffer deleter for the duration of the call. The buffer size
// actually passed on is the second member of the pair returned by get_temporary_buffer.
template <class _BidirectionalIterator, class _Compare>
_LIBCUDACXX_HIDE_FROM_ABI void inplace_merge(
  _BidirectionalIterator __first, _BidirectionalIterator __middle, _BidirectionalIterator __last, _Compare __comp)
{
  using value_type      = typename iterator_traits<_BidirectionalIterator>::value_type;
  using difference_type = typename iterator_traits<_BidirectionalIterator>::difference_type;
  using _Comp_ref       = __comp_ref_type<_Compare>;

  difference_type __len1 = _CUDA_VSTD::distance(__first, __middle);
  difference_type __len2 = _CUDA_VSTD::distance(__middle, __last);

  // Scratch space for the shorter of the two halves.
  difference_type __wanted           = _CUDA_VSTD::min(__len1, __len2);
  pair<value_type*, ptrdiff_t> __buf = _CUDA_VSTD::get_temporary_buffer<value_type>(__wanted);
  unique_ptr<value_type, __return_temporary_buffer> __h(__buf.first);

  _CUDA_VSTD::__inplace_merge<_Comp_ref>(
    __first, __middle, __last, __comp, __len1, __len2, __buf.first, __buf.second);
}

// Public inplace_merge without a comparator: delegates to the comparator overload with a
// default-constructed __less.
template <class _BidirectionalIterator>
_LIBCUDACXX_HIDE_FROM_ABI void
inplace_merge(_BidirectionalIterator __first, _BidirectionalIterator __middle, _BidirectionalIterator __last)
{
  _CUDA_VSTD::inplace_merge(__first, __middle, __last, __less());
}

// stable_sort

// Merges [__first1, __last1) and [__first2, __last2) into the storage starting at __result, creating
// every output element with placement new from a moved-from input element.
//
// On a tie the element of the first range is taken (an element of the second range is chosen only
// when __comp(*__first2, *__first1) holds). __d.__incr is called once per constructed object; __d is
// the deleter of the guard __h, which is released once one input is exhausted and the other has been
// drained (presumably the guard destroys the constructed objects if something throws before that -
// __destruct_n is defined elsewhere).
template <class _Compare, class _InputIterator1, class _InputIterator2>
_CCCL_HOST_DEVICE void __merge_move_construct(
  _InputIterator1 __first1,
  _InputIterator1 __last1,
  _InputIterator2 __first2,
  _InputIterator2 __last2,
  typename iterator_traits<_InputIterator1>::value_type* __result,
  _Compare __comp)
{
  typedef typename iterator_traits<_InputIterator1>::value_type value_type;
  __destruct_n __d(0);
  unique_ptr<value_type, __destruct_n&> __h(__result, __d);
  for (; true; ++__result)
  {
    if (__first1 == __last1)
    {
      // First range exhausted: drain the second one.
      // Every operand after the first is cast to void so that a user-provided operator, on the
      // iterator type can never be selected for the increment expression.
      for (; __first2 != __last2; ++__first2, (void) ++__result, (void) __d.__incr((value_type*) 0))
      {
        ::new (__result) value_type(_CUDA_VSTD::move(*__first2));
      }
      __h.release();
      return;
    }
    if (__first2 == __last2)
    {
      // Second range exhausted: drain the first one (same void-cast guard as above).
      for (; __first1 != __last1; ++__first1, (void) ++__result, (void) __d.__incr((value_type*) 0))
      {
        ::new (__result) value_type(_CUDA_VSTD::move(*__first1));
      }
      __h.release();
      return;
    }
    if (__comp(*__first2, *__first1))
    {
      ::new (__result) value_type(_CUDA_VSTD::move(*__first2));
      __d.__incr((value_type*) 0);
      ++__first2;
    }
    else
    {
      ::new (__result) value_type(_CUDA_VSTD::move(*__first1));
      __d.__incr((value_type*) 0);
      ++__first1;
    }
  }
}

// Merges [__first1, __last1) and [__first2, __last2) into __result by move assignment.
//
// On a tie the element of the first range is taken (an element of the second range is chosen only
// when __comp(*__first2, *__first1) holds). Unlike __half_inplace_merge, the leftovers of both ranges
// are moved to the output.
template <class _Compare, class _InputIterator1, class _InputIterator2, class _OutputIterator>
_CCCL_HOST_DEVICE void __merge_move_assign(
  _InputIterator1 __first1,
  _InputIterator1 __last1,
  _InputIterator2 __first2,
  _InputIterator2 __last2,
  _OutputIterator __result,
  _Compare __comp)
{
  while (__first1 != __last1)
  {
    if (__first2 == __last2)
    {
      // Second range exhausted: move over what is left of the first range and stop.
      while (__first1 != __last1)
      {
        *__result = _CUDA_VSTD::move(*__first1);
        ++__first1;
        ++__result;
      }
      return;
    }
    if (__comp(*__first2, *__first1))
    {
      *__result = _CUDA_VSTD::move(*__first2);
      ++__first2;
    }
    else
    {
      *__result = _CUDA_VSTD::move(*__first1);
      ++__first1;
    }
    ++__result;
  }
  // First range exhausted: move over what is left of the second range.
  while (__first2 != __last2)
  {
    *__result = _CUDA_VSTD::move(*__first2);
    ++__first2;
    ++__result;
  }
}

// Forward declaration: __stable_sort_move below calls __stable_sort, and __stable_sort (defined after
// it) calls __stable_sort_move in turn.
template <class _Compare, class _RandomAccessIterator>
_CCCL_HOST_DEVICE void __stable_sort(
  _RandomAccessIterator __first,
  _RandomAccessIterator __last,
  _Compare __comp,
  typename iterator_traits<_RandomAccessIterator>::difference_type __len,
  typename iterator_traits<_RandomAccessIterator>::value_type* __buff,
  ptrdiff_t __buff_size);

// Sorts the __len elements of [__first1, __last1) and leaves the sorted result, move-constructed, in
// the storage starting at __first2.
//
//   * 0 elements: nothing to do;
//   * 1 element : move-construct it into the destination;
//   * 2 elements: move-construct them in the right order (the later element goes first only when
//                 __comp says it is smaller);
//   * up to 8   : __insertion_sort_move;
//   * otherwise : sort both halves in place with __stable_sort, using the matching part of the
//                 destination as scratch space, then merge them into the destination with
//                 __merge_move_construct.
template <class _Compare, class _RandomAccessIterator>
_CCCL_HOST_DEVICE void __stable_sort_move(
  _RandomAccessIterator __first1,
  _RandomAccessIterator __last1,
  _Compare __comp,
  typename iterator_traits<_RandomAccessIterator>::difference_type __len,
  typename iterator_traits<_RandomAccessIterator>::value_type* __first2)
{
  using value_type      = typename iterator_traits<_RandomAccessIterator>::value_type;
  using difference_type = typename iterator_traits<_RandomAccessIterator>::difference_type;

  if (__len == 0)
  {
    return;
  }
  if (__len == 1)
  {
    ::new (__first2) value_type(_CUDA_VSTD::move(*__first1));
    return;
  }
  if (__len == 2)
  {
    // __d.__incr is called after the first construction only; the guard is released right after
    // the second one.
    __destruct_n __d(0);
    unique_ptr<value_type, __destruct_n&> __h2(__first2, __d);
    if (__comp(*--__last1, *__first1))
    {
      // Out of order: the second input element goes first.
      ::new (__first2) value_type(_CUDA_VSTD::move(*__last1));
      __d.__incr((value_type*) 0);
      ++__first2;
      ::new (__first2) value_type(_CUDA_VSTD::move(*__first1));
    }
    else
    {
      ::new (__first2) value_type(_CUDA_VSTD::move(*__first1));
      __d.__incr((value_type*) 0);
      ++__first2;
      ::new (__first2) value_type(_CUDA_VSTD::move(*__last1));
    }
    __h2.release();
    return;
  }
  if (__len <= 8)
  {
    __insertion_sort_move<_Compare>(__first1, __last1, __first2, __comp);
    return;
  }

  // Split in two, sort each half in place, then merge into the destination.
  const difference_type __half = __len / 2;
  const difference_type __rest = __len - __half;
  _RandomAccessIterator __mid  = __first1 + __half;
  __stable_sort<_Compare>(__first1, __mid, __comp, __half, __first2, __half);
  __stable_sort<_Compare>(__mid, __last1, __comp, __rest, __first2 + __half, __rest);
  __merge_move_construct<_Compare>(__first1, __mid, __mid, __last1, __first2, __comp);
}

// Length threshold consulted by stable_sort and __stable_sort: 128 when _Tp is trivially copy
// assignable, 0 otherwise. __stable_sort insertion-sorts ranges no longer than this value, and
// stable_sort only requests a temporary buffer for longer ranges.
template <class _Tp>
struct __stable_sort_switch
{
  static const unsigned value = is_trivially_copy_assignable<_Tp>::value ? 128 : 0;
};

// Stable sort of the __len elements of [__first, __last), using __buff / __buff_size as scratch space.
//
//   * 0 or 1 elements: nothing to do;
//   * 2 elements     : swap them if __comp says the second is smaller;
//   * up to __stable_sort_switch<value_type>::value elements: __insertion_sort;
//   * the whole range fits into the buffer: sort each half *into* the buffer with
//     __stable_sort_move, then merge the two buffered halves back with __merge_move_assign;
//   * otherwise: sort both halves recursively and combine them with __inplace_merge.
template <class _Compare, class _RandomAccessIterator>
_CCCL_HOST_DEVICE void __stable_sort(
  _RandomAccessIterator __first,
  _RandomAccessIterator __last,
  _Compare __comp,
  typename iterator_traits<_RandomAccessIterator>::difference_type __len,
  typename iterator_traits<_RandomAccessIterator>::value_type* __buff,
  ptrdiff_t __buff_size)
{
  using value_type      = typename iterator_traits<_RandomAccessIterator>::value_type;
  using difference_type = typename iterator_traits<_RandomAccessIterator>::difference_type;

  if (__len == 0 || __len == 1)
  {
    return;
  }
  if (__len == 2)
  {
    if (__comp(*--__last, *__first))
    {
      swap(*__first, *__last);
    }
    return;
  }
  if (__len <= static_cast<difference_type>(__stable_sort_switch<value_type>::value))
  {
    __insertion_sort<_Compare>(__first, __last, __comp);
    return;
  }

  const difference_type __half = __len / 2;
  const difference_type __rest = __len - __half;
  _RandomAccessIterator __mid  = __first + __half;

  if (__len <= __buff_size)
  {
    // Buffered path. __d is the deleter of the guard __h2 and is told, after each half has been
    // moved into the buffer, how many buffer objects exist (__half, then __len). The guard is not
    // released, so it is still in effect when the function returns.
    __destruct_n __d(0);
    unique_ptr<value_type, __destruct_n&> __h2(__buff, __d);
    __stable_sort_move<_Compare>(__first, __mid, __comp, __half, __buff);
    __d.__set(__half, (value_type*) 0);
    __stable_sort_move<_Compare>(__mid, __last, __comp, __rest, __buff + __half);
    __d.__set(__len, (value_type*) 0);
    __merge_move_assign<_Compare>(__buff, __buff + __half, __buff + __half, __buff + __len, __first, __comp);
    return;
  }

  // Buffer too small for the whole range: recurse on the halves and merge in place.
  __stable_sort<_Compare>(__first, __mid, __comp, __half, __buff, __buff_size);
  __stable_sort<_Compare>(__mid, __last, __comp, __rest, __buff, __buff_size);
  __inplace_merge<_Compare>(__first, __mid, __last, __comp, __half, __rest, __buff, __buff_size);
}

// Public stable_sort taking a comparator.
//
// A temporary buffer for the whole range is requested only when the range is longer than
// __stable_sort_switch<value_type>::value; otherwise __stable_sort is called with a null buffer of
// size 0. When a buffer is obtained, its pointer is owned by a unique_ptr with the
// __return_temporary_buffer deleter for the duration of the call.
template <class _RandomAccessIterator, class _Compare>
_LIBCUDACXX_HIDE_FROM_ABI void stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
{
  using value_type      = typename iterator_traits<_RandomAccessIterator>::value_type;
  using difference_type = typename iterator_traits<_RandomAccessIterator>::difference_type;
  using _Comp_ref       = __comp_ref_type<_Compare>;

  const difference_type __len = __last - __first;

  pair<value_type*, ptrdiff_t> __buf(0, 0);
  unique_ptr<value_type, __return_temporary_buffer> __h;

  const difference_type __no_buffer_max = static_cast<difference_type>(__stable_sort_switch<value_type>::value);
  if (__len > __no_buffer_max)
  {
    __buf = _CUDA_VSTD::get_temporary_buffer<value_type>(__len);
    __h.reset(__buf.first);
  }

  __stable_sort<_Comp_ref>(__first, __last, __comp, __len, __buf.first, __buf.second);
}

// Public stable_sort without a comparator: delegates to the comparator overload with a
// default-constructed __less.
template <class _RandomAccessIterator>
_LIBCUDACXX_HIDE_FROM_ABI void stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last)
{
  _CUDA_VSTD::stable_sort(__first, __last, __less());
}

// nth_element

// Selection step behind nth_element: repeatedly partitions [__first, __last) around a pivot chosen
// with __sort3 and continues only with the side that contains __nth, until that side is short enough
// to be finished directly (dedicated cases up to three elements, __selection_sort up to __limit).
//
// If a partition step needed no swaps, the side containing __nth is checked for already being sorted,
// in which case the function returns immediately.
template <class _Compare, class _RandomAccessIterator>
_CCCL_HOST_DEVICE void
__nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth, _RandomAccessIterator __last, _Compare __comp)
{
  // _Compare is known to be a reference type
  typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
  // Ranges of at most this many elements are finished with __selection_sort.
  const difference_type __limit = 7;
  while (true)
  {
  __restart:
    if (__nth == __last)
    {
      return;
    }
    difference_type __len = __last - __first;
    switch (__len)
    {
      case 0:
      case 1:
        return;
      case 2:
        if (__comp(*--__last, *__first))
        {
          swap(*__first, *__last);
        }
        return;
      case 3: {
        _RandomAccessIterator __m = __first;
        _CUDA_VSTD::__sort3<_Compare>(__first, ++__m, --__last, __comp);
        return;
      }
    }
    if (__len <= __limit)
    {
      __selection_sort<_Compare>(__first, __last, __comp);
      return;
    }
    // __len > __limit >= 3
    _RandomAccessIterator __m   = __first + __len / 2;
    _RandomAccessIterator __lm1 = __last;
    // Number of swaps performed while choosing the pivot and partitioning.
    unsigned __n_swaps          = _CUDA_VSTD::__sort3<_Compare>(__first, __m, --__lm1, __comp);
    // *__m is median
    // partition [__first, __m) < *__m and *__m <= [__m, __last)
    // (this inhibits tossing elements equivalent to __m around unnecessarily)
    _RandomAccessIterator __i = __first;
    _RandomAccessIterator __j = __lm1;
    // j points beyond range to be tested, *__m is known to be <= *__lm1
    // The search going up is known to be guarded but the search coming down isn't.
    // Prime the downward search with a guard.
    if (!__comp(*__i, *__m)) // if *__first == *__m
    {
      // *__first == *__m, *__first doesn't go in first part
      // manually guard downward moving __j against __i
      while (true)
      {
        if (__i == --__j)
        {
          // *__first == *__m, *__m <= all other elements
          // Partition instead into [__first, __i) == *__first and *__first < [__i, __last)
          ++__i; // __first + 1
          __j = __last;
          if (!__comp(*__first, *--__j)) // we need a guard if *__first == *(__last-1)
          {
            while (true)
            {
              if (__i == __j)
              {
                return; // [__first, __last) all equivalent elements
              }
              if (__comp(*__first, *__i))
              {
                swap(*__i, *__j);
                ++__n_swaps;
                ++__i;
                break;
              }
              ++__i;
            }
          }
          // [__first, __i) == *__first and *__first < [__j, __last) and __j == __last - 1
          if (__i == __j)
          {
            return;
          }
          while (true)
          {
            while (!__comp(*__first, *__i))
            {
              ++__i;
            }
            while (__comp(*__first, *--__j))
              ;
            if (__i >= __j)
            {
              break;
            }
            swap(*__i, *__j);
            ++__n_swaps;
            ++__i;
          }
          // [__first, __i) == *__first and *__first < [__i, __last)
          // The first part is sorted,
          if (__nth < __i)
          {
            return;
          }
          // __nth_element the second part
          // __nth_element<_Compare>(__i, __nth, __last, __comp);
          __first = __i;
          goto __restart;
        }
        if (__comp(*__j, *__m))
        {
          swap(*__i, *__j);
          ++__n_swaps;
          break; // found guard for downward moving __j, now use unguarded partition
        }
      }
    }
    ++__i;
    // j points beyond range to be tested, *__m is known to be <= *__lm1
    // if not yet partitioned...
    if (__i < __j)
    {
      // known that *(__i - 1) < *__m
      while (true)
      {
        // __m still guards upward moving __i
        while (__comp(*__i, *__m))
        {
          ++__i;
        }
        // It is now known that a guard exists for downward moving __j
        while (!__comp(*--__j, *__m))
          ;
        if (__i >= __j)
        {
          break;
        }
        swap(*__i, *__j);
        ++__n_swaps;
        // It is known that __m != __j
        // If __m just moved, follow it
        if (__m == __i)
        {
          __m = __j;
        }
        ++__i;
      }
    }
    // [__first, __i) < *__m and *__m <= [__i, __last)
    if (__i != __m && __comp(*__m, *__i))
    {
      swap(*__i, *__m);
      ++__n_swaps;
    }
    // [__first, __i) < *__i and *__i <= [__i+1, __last)
    if (__nth == __i)
    {
      return;
    }
    if (__n_swaps == 0)
    {
      // We were given a perfectly partitioned sequence.  Coincidence?
      if (__nth < __i)
      {
        // Check for [__first, __i) already sorted
        __j = __m = __first;
        while (++__j != __i)
        {
          if (__comp(*__j, *__m))
          {
            // not yet sorted, so sort
            goto not_sorted;
          }
          __m = __j;
        }
        // [__first, __i) sorted
        return;
      }
      else
      {
        // Check for [__i, __last) already sorted
        __j = __m = __i;
        while (++__j != __last)
        {
          if (__comp(*__j, *__m))
          {
            // not yet sorted, so sort
            goto not_sorted;
          }
          __m = __j;
        }
        // [__i, __last) sorted
        return;
      }
    }
  not_sorted:
    // __nth_element on range containing __nth
    if (__nth < __i)
    {
      // __nth_element<_Compare>(__first, __nth, __i, __comp);
      __last = __i;
    }
    else
    {
      // __nth_element<_Compare>(__i+1, __nth, __last, __comp);
      __first = ++__i;
    }
  }
}

// Public nth_element taking a comparator: forwards to __nth_element, passing the comparator through
// __comp_ref_type<_Compare>.
template <class _RandomAccessIterator, class _Compare>
_LIBCUDACXX_HIDE_FROM_ABI void
nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth, _RandomAccessIterator __last, _Compare __comp)
{
  typedef __comp_ref_type<_Compare> _Comp_ref;
  __nth_element<_Comp_ref>(__first, __nth, __last, __comp);
}

// Public nth_element without a comparator: delegates to the comparator overload with a
// default-constructed __less.
template <class _RandomAccessIterator>
_LIBCUDACXX_HIDE_FROM_ABI void
nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth, _RandomAccessIterator __last)
{
  _CUDA_VSTD::nth_element(__first, __nth, __last, __less());
}

#endif
_LIBCUDACXX_END_NAMESPACE_STD

#if defined(_LIBCUDACXX_HAS_PARALLEL_ALGORITHMS) && _CCCL_STD_VER >= 2017
#  include <__pstl_algorithm>
#endif

_CCCL_POP_MACROS

#endif // _LIBCUDACXX_ALGORITHM
