Skip to content

Commit f3ac57e

Browse files
committed
Add new ordered_index_t class
This will be used in the new ram middle for efficiently storing node locations, way node lists, tags etc.
1 parent 0e2e9d6 commit f3ac57e

File tree

5 files changed

+417
-0
lines changed

5 files changed

+417
-0
lines changed

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ set(osm2pgsql_lib_SOURCES
1414
node-persistent-cache.cpp
1515
node-ram-cache.cpp
1616
options.cpp
17+
ordered-index.cpp
1718
osmdata.cpp
1819
osmium-builder.cpp
1920
output-gazetteer.cpp

src/ordered-index.cpp

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/**
2+
* SPDX-License-Identifier: GPL-2.0-or-later
3+
*
4+
* This file is part of osm2pgsql (https://osm2pgsql.org/).
5+
*
6+
* Copyright (C) 2006-2021 by the osm2pgsql developer community.
7+
* For a full list of authors see the git log.
8+
*/
9+
10+
#include "ordered-index.hpp"

#include <algorithm>
#include <cassert>
#include <limits>
#include <numeric>
14+
15+
std::size_t ordered_index_t::capacity() const noexcept
16+
{
17+
return std::accumulate(m_ranges.cbegin(), m_ranges.cend(), 0ULL,
18+
[](std::size_t sum, range_entry const &range) {
19+
return sum + range.index.capacity();
20+
});
21+
}
22+
23+
/**
 * Add an entry to the index.
 *
 * A new second level block is started when there is no block yet, when the
 * current block is full, or when the id or the offset can not be stored as
 * a 32 bit unsigned value relative to the start of the current block.
 *
 * \param id The key of the index.
 * \param offset The value of the index.
 *
 * \pre Id and offset must be larger than any previously added id and
 *      offset, respectively.
 */
void ordered_index_t::add(osmid_t id, std::size_t offset)
{
    assert(m_ranges.empty() ||
           (last().to < id &&
            (last().offset_from + last().index.back().offset) < offset));

    // Second level entries store id and offset as uint32_t deltas, so both
    // deltas must fit. Without the check on the offset delta the cast below
    // would silently truncate large offsets.
    constexpr auto max_delta = std::numeric_limits<uint32_t>::max();

    if (need_new_2nd_level() || (id - last().from) > max_delta ||
        (offset - last().offset_from) > max_delta) {
        if (!m_ranges.empty()) {
            // Close the gap between the previous range and the new one so
            // that lookups of ids in between end up in the previous range.
            m_ranges.back().to = id - 1;
        }
        m_ranges.emplace_back(id, offset, m_block_size);
        if (m_block_size < max_block_size) {
            m_block_size <<= 1U;
        }
    }

    // The first entry of every second level block is always {0, 0}, because
    // the block starts at exactly this id and offset. We leave it that way
    // to simplify the code.
    m_ranges.back().index.push_back(second_level_index_entry{
        static_cast<uint32_t>(id - last().from),
        static_cast<uint32_t>(offset - last().offset_from)});
    m_ranges.back().to = id;
    ++m_size;
}
48+
49+
/**
 * Find the entry for the specified id or, if there is no such entry, the
 * entry with the next smaller id.
 *
 * \param id The id to look for.
 * \returns Pair of the id actually found and its offset. If the index is
 *          empty or the id is smaller than the first id in the index, the
 *          pair {0, not_found_value()} is returned.
 */
std::pair<osmid_t, std::size_t> ordered_index_t::get_internal(osmid_t id) const
    noexcept
{
    std::pair<osmid_t, std::size_t> const nothing{0, not_found_value()};

    if (m_ranges.empty()) {
        return nothing;
    }

    // First level: the first range that ends at or after the wanted id.
    auto const range_it = std::lower_bound(
        m_ranges.cbegin(), m_ranges.cend(), id,
        [](range_entry const &entry, osmid_t wanted) {
            return entry.to < wanted;
        });

    if (range_it == m_ranges.cend()) {
        // The id is larger than every id in the index, so the very last
        // entry is the next smaller one.
        auto const &tail = m_ranges.back();
        auto const &newest = tail.index.back();
        return {tail.from + newest.id, tail.offset_from + newest.offset};
    }

    auto const &range = *range_it;
    if (id < range.from) {
        return nothing;
    }

    // Second level: first entry with a relative id larger than the wanted
    // one; the entry directly before it is the result.
    auto const after = std::upper_bound(
        range.index.cbegin(), range.index.cend(), id - range.from,
        [](std::size_t wanted, second_level_index_entry const &entry) {
            return wanted < entry.id;
        });
    assert(after != range.index.cbegin());

    auto const &hit = *(after - 1);
    return {range.from + hit.id, range.offset_from + hit.offset};
}

src/ordered-index.hpp

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
#ifndef OSM2PSGQL_ORDERED_INDEX_HPP
2+
#define OSM2PSGQL_ORDERED_INDEX_HPP
3+
4+
/**
5+
* SPDX-License-Identifier: GPL-2.0-or-later
6+
*
7+
* This file is part of osm2pgsql (https://osm2pgsql.org/).
8+
*
9+
* Copyright (C) 2006-2021 by the osm2pgsql developer community.
10+
* For a full list of authors see the git log.
11+
*/
12+
13+
#include "osmtypes.hpp"
14+
15+
#include <cstddef>
16+
#include <cstdint>
17+
#include <limits>
18+
#include <utility>
19+
#include <vector>
20+
21+
/**
22+
* This class implements a memory-efficient ordered index for lookups from OSM
23+
* ids to an "offset" into some kind of primary datastore. Adding to the index
24+
* is amortized O(1), reading is O(log n) (binary search on both levels).
25+
*
26+
* Entries must always be added in order from lowest OSM id to highest OSM
27+
* id and lowest offset to highest offset, ie. both id and offset for each
28+
* entry must be strictly larger than the previous one. Entries can never
29+
* be changed.
30+
*
31+
* An index that is never used doesn't need more memory than
32+
* sizeof(ordered_index_t).
33+
*
34+
* All allocated memory can be freed by calling clear(). After that the index
35+
* can NOT be reused.
36+
*
37+
* There are two ways of accessing the data through the index:
38+
* * The get() method returns the offset for the specified id.
39+
* * The get_block() method returns the offset for the next smaller id, if the
40+
* id itself is not found.
41+
*
42+
* The implementation is in two levels, the second level blocks contain the id
43+
* and offset, the first level keeps track of second level blocks and the first
44+
* and last ids used in each block. There are two reasons for the choice of
45+
* this two-level design over a simpler vector-based design:
46+
*
47+
* * Vectors temporarily use a lot of memory when resizing. We can avoid this
48+
* by not resizing the second level blocks. We also save the memcpy needed
49+
* when resizing.
50+
* * To conserve memory, the id and offset in the second level blocks are 32
51+
* bit unsigned integers relative to the id and offset of the first id of a
52+
* block which is stored in the first level entry. Compared to the 64 bit
53+
integers we would need without the two-level design, this halves the
54+
* memory use.
55+
*/
56+
class ordered_index_t
57+
{
58+
public:
59+
/**
60+
* Constructor.
61+
*
62+
* \param initial_block_size Number of entries in the initial second level
63+
* index block. Subsequent blocks will each
64+
* double their size until max_block_size is
65+
* reached.
66+
*/
67+
explicit ordered_index_t(std::size_t initial_block_size = 1024 * 1024)
68+
: m_block_size(initial_block_size)
69+
{}
70+
71+
/**
72+
* This is the value returned from the getter functions if the id is not
73+
* in the database.
74+
*/
75+
static constexpr std::size_t not_found_value() noexcept
76+
{
77+
return std::numeric_limits<std::size_t>::max();
78+
}
79+
80+
/**
81+
* How many entries will fit into the currently allocated memory. This
82+
* is accurate for normal operations, but if there are huge gaps between
83+
* consecutive ids (> 2^32), less entries than this will fit.
84+
*/
85+
std::size_t capacity() const noexcept;
86+
87+
/// The number of entries in the index.
88+
std::size_t size() const noexcept { return m_size; }
89+
90+
/**
91+
* Add an entry to the index.
92+
*
93+
* \param id The key of the index.
94+
* \param offset The value of the index.
95+
*
96+
* \pre Id and offset must be larger than any previously added id and
97+
* offset, respectively.
98+
*/
99+
void add(osmid_t id, std::size_t offset);
100+
101+
/**
102+
* Get the offset for the specified id.
103+
*
104+
* If the id is not in the index \code not_found_value() \endcode is
105+
* returned.
106+
*
107+
* \param id The id to look for.
108+
*/
109+
std::size_t get(osmid_t id) const noexcept
110+
{
111+
auto const p = get_internal(id);
112+
if (p.first != id) {
113+
return not_found_value();
114+
}
115+
return p.second;
116+
}
117+
118+
/**
119+
* Get the offset for the specified id or, if the id is not in the index,
120+
* the next smaller id available in the index.
121+
*
122+
* If the id is not in the index and no smaller id is in the index,
123+
* \code not_found_value() \endcode is returned.
124+
*
125+
* \param id The id to look for.
126+
*/
127+
std::size_t get_block(osmid_t id) const noexcept
128+
{
129+
return get_internal(id).second;
130+
}
131+
132+
/**
133+
* The approximate amount of bytes currently allocated by this index.
134+
*/
135+
std::size_t used_memory() const noexcept
136+
{
137+
return m_ranges.capacity() * sizeof(range_entry) +
138+
capacity() * sizeof(second_level_index_entry);
139+
}
140+
141+
/**
142+
* Clear all memory used by this index. The index can NOT be reused after
143+
* that.
144+
*/
145+
void clear()
146+
{
147+
m_ranges.clear();
148+
m_ranges.shrink_to_fit();
149+
m_size = 0;
150+
}
151+
152+
private:
153+
struct second_level_index_entry
154+
{
155+
uint32_t id;
156+
uint32_t offset;
157+
};
158+
159+
struct range_entry
160+
{
161+
std::vector<second_level_index_entry> index;
162+
osmid_t from;
163+
osmid_t to = 0;
164+
std::size_t offset_from;
165+
166+
range_entry(osmid_t id, std::size_t offset, std::size_t block_size)
167+
: from(id), offset_from(offset)
168+
{
169+
index.reserve(block_size);
170+
}
171+
172+
bool full() const noexcept { return index.size() == index.capacity(); }
173+
};
174+
175+
range_entry const &last() const noexcept { return m_ranges.back(); }
176+
177+
bool need_new_2nd_level() const noexcept
178+
{
179+
return m_ranges.empty() || last().full();
180+
}
181+
182+
std::pair<osmid_t, std::size_t> get_internal(osmid_t id) const noexcept;
183+
184+
static constexpr std::size_t const max_block_size = 16 * 1024 * 1204;
185+
186+
std::vector<range_entry> m_ranges;
187+
std::size_t m_block_size;
188+
std::size_t m_size = 0;
189+
}; // class ordered_index_t
190+
191+
#endif // OSM2PSGQL_ORDERED_INDEX_HPP

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ set_test(test-middle)
4848
set_test(test-options-database LABELS NoDB)
4949
set_test(test-options-parse LABELS NoDB)
5050
set_test(test-options-projection)
51+
set_test(test-ordered-index LABELS NoDB)
5152
set_test(test-output-gazetteer)
5253
set_test(test-output-pgsql)
5354
set_test(test-output-pgsql-area)

0 commit comments

Comments
 (0)