Is the poor performance of std::vector due to not calling realloc a logarithmic number of times?
#include <vector>

// Variant 1: construct the vector at its final size, then overwrite every
// element with its own index through operator[].
int main(int argc, char * argv[])
{
    // The bound and the loop index share the vector's own unsigned index
    // type, so the loop condition never compares a signed value with an
    // unsigned one.
    using index_type = std::vector<int>::size_type;

    const index_type size = 1000000000;
    std::vector<int> b(size);
    for (index_type i = 0; i < size; i++)
    {
        // Every index is below 10^9, which is the literal this snippet
        // already stores in an int elsewhere, so the narrowing is lossless.
        b[i] = static_cast<int>(i);
    }
    return 0;
}
#include <vector>

// Variant 2: start from an empty vector and append one billion ints with
// push_back, without reserving capacity beforehand.
int main(int argc, char * argv[])
{
    const int size = 1000000000;
    std::vector<int> b;

    int next = 0;
    while (next < size)
    {
        b.push_back(next);
        ++next;
    }
    return 0;
}
#include <vector>

// Variant 3: reserve room for all one billion ints once, then append them
// with push_back.
int main(int argc, char * argv[])
{
    const int size = 1000000000;

    std::vector<int> b;
    b.reserve(size);

    int next = 0;
    while (next < size)
    {
        b.push_back(next);
        ++next;
    }
    return 0;
}
// Variant 4: a plain new[]-allocated array of one billion ints, each element
// set to its own index, released with delete[] before returning.
int main(int argc, char * argv[])
{
    const int size = 1000000000;
    int * a = new int [size];

    int idx = 0;
    while (idx < size)
    {
        a[idx] = idx;
        ++idx;
    }

    delete [] a;
    return 0;
}
#include <cstdlib>
#include <vector>

// Variant 5: fill a malloc'ed int buffer with 0..size-1 while resizing it
// with realloc every time the write index reaches the current power of two.
//
// Returns EXIT_FAILURE (after releasing the buffer) if any allocation fails.
int main(int argc, char * argv[])
{
    const std::size_t size = 1000000000;

    // The buffer starts at its full final size. The first realloc below (at
    // i == 0) shrinks it to two ints; from then on it only doubles.
    int * a = static_cast<int *>(std::malloc(size * sizeof(int)));
    if (a == nullptr)
    {
        return EXIT_FAILURE;
    }

    std::size_t next_power = 1;
    for (std::size_t i = 0; i < size; i++)
    {
        a[i] = static_cast<int>(i);
        if (i == next_power - 1)
        {
            next_power *= 2;
            // Keep the old pointer until realloc succeeds: on failure realloc
            // returns nullptr and leaves the old block allocated.
            int * resized = static_cast<int *>(std::realloc(a, next_power * sizeof(int)));
            if (resized == nullptr)
            {
                std::free(a);
                return EXIT_FAILURE;
            }
            a = resized;
        }
    }

    std::free(a);
    return 0;
}
1 2 | xorl %eax, %eax retq |
相比之下,编译器很难证明向量的分配是无用的。也许编译器开发人员可以教它识别未使用的std向量,就像他们识别未使用的raw c数组一样,但这种优化实际上是一个小问题,在我的经验中,它会导致许多分析问题。
这比std vector可以管理按位可复制类型更有效。我敢打赌,realloc版本实际上从不复制;总是有内存空间来增加向量(在真正的程序中,情况可能不是这样)。
when I add 1 billion integers and the second code is dramatically faster than the one using the vector
1 2 3 4 | std::vector<int> b(size); for(int i = 0; i < size; i++) { b[i] = i; } |
If this is what causes the slowness of the first code, how can I tell
std::vector to grow geometrically?
Is the poor performance of std::vector due to not calling realloc a logarithmic number of times?
<strike>你的测试既不支持这个结论,也不能证明相反的结论。但是,我假设重新分配被调用了线性次数,除非有相反的证据。</strike>
I suspect that this is caused by the vector internally calling realloc too many times.
how can I tell std::vector to grow geometrically?
Note that calling reserve once would work in this case but not in the more general case in which the number of push_back is not known in advance.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | class my_vect { int *data; size_t current_used; size_t current_alloc; public: my_vect() : data(nullptr) , current_used(0) , current_alloc(0) {} void push_back(int val) { if (nullptr == data) { data = new int[1]; current_alloc = 1; } else if (current_used == current_alloc) { int *temp = new int[current_alloc * 2]; for (size_t i=0; i<current_used; i++) temp[i] = data[i]; swap(temp, data); delete [] temp; current_alloc *= 2; } data[current_used++] = val; } int &at(size_t index) { if (index >= current_used) throw bad_index(); return data[index]; } int &operator[](size_t index) { return data[index]; } ~my_vect() { delete [] data; } }; |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | int main() { std::locale out(""); std::cout.imbue(out); using namespace std::chrono; std::cout <<"my_vect "; for (int size = 100; size <= 1000000000; size *= 10) { auto start = high_resolution_clock::now(); my_vect b; for(int i = 0; i < size; i++) { b.push_back(i); } auto stop = high_resolution_clock::now(); std::cout <<"Size:" << std::setw(15) << size <<", Time:" << std::setw(15) << duration_cast<microseconds>(stop-start).count() <<" us "; } std::cout <<" std::vector "; for (int size = 100; size <= 1000000000; size *= 10) { auto start = high_resolution_clock::now(); std::vector<int> b; for (int i = 0; i < size; i++) { b.push_back(i); } auto stop = high_resolution_clock::now(); std::cout <<"Size:" << std::setw(15) << size <<", Time:" << std::setw(15) << duration_cast<microseconds>(stop - start).count() <<" us "; } } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 | my_vect Size: 100, Time: 8 us Size: 1,000, Time: 23 us Size: 10,000, Time: 141 us Size: 100,000, Time: 950 us Size: 1,000,000, Time: 8,040 us Size: 10,000,000, Time: 51,404 us Size: 100,000,000, Time: 442,713 us Size: 1,000,000,000, Time: 7,936,013 us std::vector Size: 100, Time: 40 us Size: 1,000, Time: 4 us Size: 10,000, Time: 29 us Size: 100,000, Time: 426 us Size: 1,000,000, Time: 3,730 us Size: 10,000,000, Time: 41,294 us Size: 100,000,000, Time: 363,942 us Size: 1,000,000,000, Time: 5,200,545 us |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | // C++17 #include <benchmark/benchmark.h> // Googleo benchmark lib, for benchmark. #include <new> // For std::bad_alloc. #include <memory> // For std::allocator_traits, std::uninitialized_move. #include <cstdlib> // For C heap management API. #include <cstddef> // For std::size_t, std::max_align_t. #include <cassert> // For assert. #include <utility> // For std::forward, std::declval, namespace linux { #include <sys/mman.h> // For mmap, mremap, munmap. #include <errno.h> // For errno. auto get_errno() noexcept { return errno; } } /* * Allocators. * These allocators will have non-standard compliant behavior if the type T's cp ctor has side effect. */ // class mrealloc are usefull for allocating small space for // std::vector. // // Can prevent copy of data and memory fragmentation if there's enough // continous memory at the original place. 
template <class T> struct mrealloc { using pointer = T*; using value_type = T; auto allocate(std::size_t len) { if (auto ret = std::malloc(len)) return static_cast<pointer>(ret); else throw std::bad_alloc(); } auto reallocate(pointer old_ptr, std::size_t old_len, std::size_t len) { if (auto ret = std::realloc(old_ptr, len)) return static_cast<pointer>(ret); else throw std::bad_alloc(); } void deallocate(void *ptr, std::size_t len) noexcept { std::free(ptr); } }; // class mmaprealloc is suitable for large memory use. // // It will be usefull for situation that std::vector can grow to a huge // size. // // User can call reserve without worrying wasting a lot of memory. // // It can prevent data copy and memory fragmentation at any time. template <class T> struct mmaprealloc { using pointer = T*; using value_type = T; auto allocate(std::size_t len) const { return allocate_impl(len, MAP_PRIVATE | MAP_ANONYMOUS); } auto reallocate(pointer old_ptr, std::size_t old_len, std::size_t len) const { return reallocate_impl(old_ptr, old_len, len, MREMAP_MAYMOVE); } void deallocate(pointer ptr, std::size_t len) const noexcept { assert(linux::munmap(ptr, len) == 0); } protected: auto allocate_impl(std::size_t _len, int flags) const { if (auto ret = linux::mmap(nullptr, get_proper_size(_len), PROT_READ | PROT_WRITE, flags, -1, 0)) return static_cast<pointer>(ret); else fail(EAGAIN | ENOMEM); } auto reallocate_impl(pointer old_ptr, std::size_t old_len, std::size_t _len, int flags) const { if (auto ret = linux::mremap(old_ptr, old_len, get_proper_size(_len), flags)) return static_cast<pointer>(ret); else fail(EAGAIN | ENOMEM); } static inline constexpr const std::size_t magic_num = 4096 - 1; static inline auto get_proper_size(std::size_t len) noexcept -> std::size_t { return round_to_pagesize(len); } static inline auto round_to_pagesize(std::size_t len) noexcept -> std::size_t { return (len + magic_num) & ~magic_num; } static inline void fail(int assert_val) { auto _errno = 
linux::get_errno(); assert(_errno == assert_val); throw std::bad_alloc(); } }; template <class T> struct mmaprealloc_populate_ver: mmaprealloc<T> { auto allocate(size_t len) const { return mmaprealloc<T>::allocate_impl(len, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE); } }; namespace impl { struct disambiguation_t2 {}; struct disambiguation_t1 { constexpr operator disambiguation_t2() const noexcept { return {}; } }; template <class Alloc> static constexpr auto has_reallocate(disambiguation_t1) noexcept -> decltype(&Alloc::reallocate, bool{}) { return true; } template <class Alloc> static constexpr bool has_reallocate(disambiguation_t2) noexcept { return false; } template <class Alloc> static inline constexpr const bool has_reallocate_v = has_reallocate<Alloc>(disambiguation_t1{}); } /* impl */ template <class Alloc> struct allocator_traits: public std::allocator_traits<Alloc> { using Base = std::allocator_traits<Alloc>; using value_type = typename Base::value_type; using pointer = typename Base::pointer; using size_t = typename Base::size_type; static auto reallocate(Alloc &alloc, pointer prev_ptr, size_t prev_len, size_t new_len) { if constexpr(impl::has_reallocate_v<Alloc>) return alloc.reallocate(prev_ptr, prev_len, new_len); else { auto new_ptr = Base::allocate(alloc, new_len); // Move existing array for(auto _prev_ptr = prev_ptr, _new_ptr = new_ptr; _prev_ptr != prev_ptr + prev_len; ++_prev_ptr, ++_new_ptr) { new (_new_ptr) value_type(std::move(*_prev_ptr)); _new_ptr->~value_type(); } Base::deallocate(alloc, prev_ptr, prev_len); return new_ptr; } } }; template <class T, class Alloc = std::allocator<T>> struct vector: protected Alloc { using alloc_traits = allocator_traits<Alloc>; using pointer = typename alloc_traits::pointer; using size_t = typename alloc_traits::size_type; pointer ptr = nullptr; size_t last = 0; size_t avail = 0; ~vector() noexcept { alloc_traits::deallocate(*this, ptr, avail); } template <class ...Args> void emplace_back(Args &&...args) { if 
(last == avail) double_the_size(); alloc_traits::construct(*this, &ptr[last++], std::forward<Args>(args)...); } void double_the_size() { if (__builtin_expect(!!(avail), true)) { avail <<= 1; ptr = alloc_traits::reallocate(*this, ptr, last, avail); } else { avail = 1 << 4; ptr = alloc_traits::allocate(*this, avail); } } }; template <class T> static void BM_vector(benchmark::State &state) { for(auto _: state) { T c; for(auto i = state.range(0); --i >= 0; ) c.emplace_back((char)i); } } static constexpr const auto one_GB = 1 << 30; BENCHMARK_TEMPLATE(BM_vector, vector<char>) ->Range(1 << 3, one_GB); BENCHMARK_TEMPLATE(BM_vector, vector<char, mrealloc<char>>) ->Range(1 << 3, one_GB); BENCHMARK_TEMPLATE(BM_vector, vector<char, mmaprealloc<char>>) ->Range(1 << 3, one_GB); BENCHMARK_TEMPLATE(BM_vector, vector<char, mmaprealloc_populate_ver<char>>)->Range(1 << 3, one_GB); BENCHMARK_MAIN(); |
1 | Debian 9.4, Linux version 4.9.0-6-amd64 ( version 6.3.0 20170516 (Debian 6.3.0-18+deb9u1) ) #1 SMP Debian 4.9.82-1+deb9u3 (2018-03-02) |
1 2 | sudo cpupower frequency-set --governor performance ./a.out |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 | Run on (8 X 1600 MHz CPU s) CPU Caches: L1 Data 32K (x4) L1 Instruction 32K (x4) L2 Unified 256K (x4) L3 Unified 6144K (x1) ---------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------------------------------------------------------------- BM_vector<vector<char>>/8 58 ns 58 ns 11476934 BM_vector<vector<char>>/64 324 ns 324 ns 2225396 BM_vector<vector<char>>/512 1527 ns 1527 ns 453629 BM_vector<vector<char>>/4096 7196 ns 7196 ns 96695 BM_vector<vector<char>>/32768 50145 ns 50140 ns 13655 BM_vector<vector<char>>/262144 549821 ns 549825 ns 1245 BM_vector<vector<char>>/2097152 5007342 ns 5006393 ns 146 BM_vector<vector<char>>/16777216 42873349 ns 42873462 ns 15 BM_vector<vector<char>>/134217728 336225619 ns 336097218 ns 2 BM_vector<vector<char>>/1073741824 2642934606 ns 2642803281 ns 1 BM_vector<vector<char, mrealloc<char>>>/8 55 ns 55 ns 12914365 BM_vector<vector<char, mrealloc<char>>>/64 266 ns 266 ns 2591225 BM_vector<vector<char, mrealloc<char>>>/512 1229 ns 1229 ns 567505 BM_vector<vector<char, mrealloc<char>>>/4096 6903 ns 6903 ns 102752 BM_vector<vector<char, mrealloc<char>>>/32768 48522 ns 48523 ns 14409 BM_vector<vector<char, mrealloc<char>>>/262144 399470 ns 399368 ns 1783 BM_vector<vector<char, mrealloc<char>>>/2097152 3048578 ns 3048619 ns 229 BM_vector<vector<char, mrealloc<char>>>/16777216 24426934 ns 24421774 ns 29 BM_vector<vector<char, mrealloc<char>>>/134217728 262355961 ns 262357084 ns 3 BM_vector<vector<char, mrealloc<char>>>/1073741824 2092577020 ns 2092317044 ns 1 BM_vector<vector<char, mmaprealloc<char>>>/8 4285 ns 4285 ns 161498 BM_vector<vector<char, mmaprealloc<char>>>/64 5485 ns 5485 ns 125375 BM_vector<vector<char, mmaprealloc<char>>>/512 8571 ns 8569 ns 80345 
BM_vector<vector<char, mmaprealloc<char>>>/4096 24248 ns 24248 ns 28655 BM_vector<vector<char, mmaprealloc<char>>>/32768 165021 ns 165011 ns 4421 BM_vector<vector<char, mmaprealloc<char>>>/262144 1177041 ns 1177048 ns 557 BM_vector<vector<char, mmaprealloc<char>>>/2097152 9229860 ns 9230023 ns 74 BM_vector<vector<char, mmaprealloc<char>>>/16777216 75425704 ns 75426431 ns 9 BM_vector<vector<char, mmaprealloc<char>>>/134217728 607661012 ns 607662273 ns 1 BM_vector<vector<char, mmaprealloc<char>>>/1073741824 4871003928 ns 4870588050 ns 1 BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/8 3956 ns 3956 ns 175037 BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/64 5087 ns 5086 ns 133944 BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/512 8662 ns 8662 ns 80579 BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/4096 23883 ns 23883 ns 29265 BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/32768 158374 ns 158376 ns 4444 BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/262144 1171514 ns 1171522 ns 593 BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/2097152 9297357 ns 9293770 ns 74 BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/16777216 75140789 ns 75141057 ns 9 BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/134217728 636359403 ns 636368640 ns 1 BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/1073741824 4865103542 ns 4864582150 ns 1 |