Skip to content

Commit 4680152

Browse files
authored
Merge pull request #1464 from joto/prepare-for-new-ram-middle
Prepare for new ram middle
2 parents ae20974 + f3ac57e commit 4680152

File tree

11 files changed

+464
-0
lines changed

11 files changed

+464
-0
lines changed

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ set(osm2pgsql_lib_SOURCES
1414
node-persistent-cache.cpp
1515
node-ram-cache.cpp
1616
options.cpp
17+
ordered-index.cpp
1718
osmdata.cpp
1819
osmium-builder.cpp
1920
output-gazetteer.cpp

src/middle.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "thread-pool.hpp"
1919

2020
class options_t;
21+
struct output_requirements;
2122

2223
/**
2324
* Interface for returning information about raw OSM input data from a cache.
@@ -107,6 +108,8 @@ struct middle_t
107108
virtual idlist_t get_rels_by_way(osmid_t) { return {}; }
108109

109110
virtual std::shared_ptr<middle_query_t> get_query_instance() = 0;
111+
112+
/**
 * Tell the middle what the output needs from it. This default
 * implementation ignores the requirements.
 */
virtual void set_requirements(output_requirements const & /*requirements*/)
{
}
110113
};
111114

112115
inline middle_t::~middle_t() = default;

src/options.hpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,31 @@ struct database_options_t
4343
std::string conninfo() const;
4444
};
4545

46+
/**
47+
* Outputs can signal their requirements to the middle by setting these fields.
48+
*/
49+
struct output_requirements
{
    /// True if complete node objects are needed: tags, attributes (the
    /// latter only with --extra-attributes) and locations. When false,
    /// node locations alone are sufficient.
    bool full_nodes{false};

    /// True if complete way objects are needed: tags, attributes (the
    /// latter only with --extra-attributes) and way nodes. When false,
    /// the way nodes alone are sufficient.
    bool full_ways{false};

    /// True if complete relation objects are needed: tags, attributes (the
    /// latter only with --extra-attributes) and members. When false,
    /// nothing at all is needed from relations.
    bool full_relations{false};
};
70+
4671
/**
4772
* Structure for storing command-line and other options
4873
*/

src/ordered-index.cpp

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/**
2+
* SPDX-License-Identifier: GPL-2.0-or-later
3+
*
4+
* This file is part of osm2pgsql (https://osm2pgsql.org/).
5+
*
6+
* Copyright (C) 2006-2021 by the osm2pgsql developer community.
7+
* For a full list of authors see the git log.
8+
*/
9+
10+
#include "ordered-index.hpp"

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <numeric>
14+
15+
std::size_t ordered_index_t::capacity() const noexcept
16+
{
17+
return std::accumulate(m_ranges.cbegin(), m_ranges.cend(), 0ULL,
18+
[](std::size_t sum, range_entry const &range) {
19+
return sum + range.index.capacity();
20+
});
21+
}
22+
23+
/**
 * Append an entry mapping id to offset.
 *
 * Both id and offset must be strictly larger than those of the previously
 * added entry (checked by assertion only).
 *
 * A new first level range (and with it a new second level block) is started
 * when
 *  - there is no range yet or the current block is full, or
 *  - the id or the offset, relative to the start of the current range, does
 *    not fit into the 32 bit fields of a second level entry.
 */
void ordered_index_t::add(osmid_t id, std::size_t offset)
{
    assert(m_ranges.empty() ||
           (last().to < id &&
            (last().offset_from + last().index.back().offset) < offset));

    constexpr auto max_delta = std::numeric_limits<uint32_t>::max();

    // need_new_2nd_level() must be evaluated first: it is true for an empty
    // index, which keeps last() from being called without any range.
    // Second level entries store id and offset as 32 bit deltas, so BOTH
    // deltas have to be checked, otherwise the casts below would silently
    // truncate.
    bool const start_new_range =
        need_new_2nd_level() ||
        (id - last().from) > max_delta ||
        (offset - last().offset_from) > max_delta;

    if (start_new_range) {
        if (!m_ranges.empty()) {
            // Close the gap to the new range so that lookups of ids between
            // the two ranges resolve to the last entry of the previous one.
            m_ranges.back().to = id - 1;
        }
        m_ranges.emplace_back(id, offset, m_block_size);
        if (m_block_size < max_block_size) {
            m_block_size <<= 1U;
        }
    }

    // Yes, the first entry of every second level block is always {0, 0},
    // because deltas are relative to the first entry of the block. We leave
    // it that way to simplify the code.
    m_ranges.back().index.push_back(second_level_index_entry{
        static_cast<uint32_t>(id - last().from),
        static_cast<uint32_t>(offset - last().offset_from)});
    m_ranges.back().to = id;
    ++m_size;
}
48+
49+
/**
 * Look up the entry for the given id or, if there is none, the entry with
 * the next smaller id.
 *
 * \returns Pair of (id of the entry found, its offset). If the index is
 *          empty or the id is smaller than the first id in the index,
 *          {0, not_found_value()} is returned.
 */
std::pair<osmid_t, std::size_t> ordered_index_t::get_internal(osmid_t id) const
    noexcept
{
    if (m_ranges.empty()) {
        return {0, not_found_value()};
    }

    // First level: find the first range that ends at or after the id.
    auto const range_it = std::partition_point(
        m_ranges.cbegin(), m_ranges.cend(),
        [id](range_entry const &range) { return range.to < id; });

    // The id is beyond everything in the index: report the very last entry.
    if (range_it == m_ranges.cend()) {
        auto const &final_range = m_ranges.back();
        auto const &final_entry = final_range.index.back();
        return {final_range.from + final_entry.id,
                final_range.offset_from + final_entry.offset};
    }

    if (id < range_it->from) {
        return {0, not_found_value()};
    }

    // Second level: find the first entry with a relative id larger than the
    // one we are looking for, the entry before that is the result.
    auto const relative_id = static_cast<std::size_t>(id - range_it->from);
    auto entry_it = std::partition_point(
        range_it->index.cbegin(), range_it->index.cend(),
        [relative_id](second_level_index_entry const &entry) {
            return !(relative_id < entry.id);
        });
    assert(entry_it != range_it->index.cbegin());
    --entry_it;

    return {range_it->from + entry_it->id,
            range_it->offset_from + entry_it->offset};
}

src/ordered-index.hpp

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
#ifndef OSM2PSGQL_ORDERED_INDEX_HPP
2+
#define OSM2PSGQL_ORDERED_INDEX_HPP
3+
4+
/**
5+
* SPDX-License-Identifier: GPL-2.0-or-later
6+
*
7+
* This file is part of osm2pgsql (https://osm2pgsql.org/).
8+
*
9+
* Copyright (C) 2006-2021 by the osm2pgsql developer community.
10+
* For a full list of authors see the git log.
11+
*/
12+
13+
#include "osmtypes.hpp"
14+
15+
#include <cstddef>
16+
#include <cstdint>
17+
#include <limits>
18+
#include <utility>
19+
#include <vector>
20+
21+
/**
22+
* This class implements a memory-efficient ordered index for lookups from OSM
23+
* ids to an "offset" into some kind of primary datastore. Adding to the index
24+
* is amortized O(1), reading is O(log n) (two binary searches).
25+
*
26+
* Entries must always be added in order from lowest OSM id to highest OSM
27+
* id and lowest offset to highest offset, ie. both id and offset for each
28+
* entry must be strictly larger than the previous one. Entries can never
29+
* be changed.
30+
*
31+
* An index that is never used doesn't need more memory than
32+
* sizeof(ordered_index_t).
33+
*
34+
* All allocated memory can be freed by calling clear(). After that the index
35+
* can NOT be reused.
36+
*
37+
* There are two ways of accessing the data through the index:
38+
* * The get() method returns the offset for the specified id.
39+
* * The get_block() method returns the offset for the next smaller id, if the
40+
* id itself is not found.
41+
*
42+
* The implementation is in two levels, the second level blocks contain the id
43+
* and offset, the first level keeps track of second level blocks and the first
44+
* and last ids used in each block. There are two reasons for the choice of
45+
* this two-level design over a simpler vector-based design:
46+
*
47+
* * Vectors temporarily use a lot of memory when resizing. We can avoid this
48+
* by not resizing the second level blocks. We also save the memcpy needed
49+
* when resizing.
50+
* * To conserve memory, the id and offset in the second level blocks are 32
51+
* bit unsigned integers relative to the id and offset of the first id of a
52+
* block which is stored in the first level entry. Compared to the 64 bit
53+
* integers we would need without the two-level design, this halves the
54+
* memory use.
55+
*/
56+
class ordered_index_t
57+
{
58+
public:
59+
/**
60+
* Constructor.
61+
*
62+
* \param initial_block_size Number of entries in the initial second level
63+
* index block. Subsequent blocks will each
64+
* double their size until max_block_size is
65+
* reached.
66+
*/
67+
explicit ordered_index_t(std::size_t initial_block_size = 1024 * 1024)
68+
: m_block_size(initial_block_size)
69+
{}
70+
71+
/**
72+
* This is the value returned from the getter functions if the id is not
73+
* in the database.
74+
*/
75+
static constexpr std::size_t not_found_value() noexcept
76+
{
77+
return std::numeric_limits<std::size_t>::max();
78+
}
79+
80+
/**
81+
* How many entries will fit into the currently allocated memory. This
82+
* is accurate for normal operations, but if there are huge gaps between
83+
* consecutive ids (> 2^32), less entries than this will fit.
84+
*/
85+
std::size_t capacity() const noexcept;
86+
87+
/// The number of entries in the index.
88+
std::size_t size() const noexcept { return m_size; }
89+
90+
/**
91+
* Add an entry to the index.
92+
*
93+
* \param id The key of the index.
94+
* \param offset The value of the index.
95+
*
96+
* \pre Id and offset must be larger than any previously added id and
97+
* offset, respectively.
98+
*/
99+
void add(osmid_t id, std::size_t offset);
100+
101+
/**
102+
* Get the offset for the specified id.
103+
*
104+
* If the id is not in the index \code not_found_value() \endcode is
105+
* returned.
106+
*
107+
* \param id The id to look for.
108+
*/
109+
std::size_t get(osmid_t id) const noexcept
110+
{
111+
auto const p = get_internal(id);
112+
if (p.first != id) {
113+
return not_found_value();
114+
}
115+
return p.second;
116+
}
117+
118+
/**
119+
* Get the offset for the specified id or, if the id is not in the index,
120+
* the next smaller id available in the index.
121+
*
122+
* If the id is not in the index and no smaller id is in the index,
123+
* \code not_found_value() \endcode is returned.
124+
*
125+
* \param id The id to look for.
126+
*/
127+
std::size_t get_block(osmid_t id) const noexcept
128+
{
129+
return get_internal(id).second;
130+
}
131+
132+
/**
133+
* The approximate amount of bytes currently allocated by this index.
134+
*/
135+
std::size_t used_memory() const noexcept
136+
{
137+
return m_ranges.capacity() * sizeof(range_entry) +
138+
capacity() * sizeof(second_level_index_entry);
139+
}
140+
141+
/**
142+
* Clear all memory used by this index. The index can NOT be reused after
143+
* that.
144+
*/
145+
void clear()
146+
{
147+
m_ranges.clear();
148+
m_ranges.shrink_to_fit();
149+
m_size = 0;
150+
}
151+
152+
private:
153+
struct second_level_index_entry
154+
{
155+
uint32_t id;
156+
uint32_t offset;
157+
};
158+
159+
struct range_entry
160+
{
161+
std::vector<second_level_index_entry> index;
162+
osmid_t from;
163+
osmid_t to = 0;
164+
std::size_t offset_from;
165+
166+
range_entry(osmid_t id, std::size_t offset, std::size_t block_size)
167+
: from(id), offset_from(offset)
168+
{
169+
index.reserve(block_size);
170+
}
171+
172+
bool full() const noexcept { return index.size() == index.capacity(); }
173+
};
174+
175+
range_entry const &last() const noexcept { return m_ranges.back(); }
176+
177+
bool need_new_2nd_level() const noexcept
178+
{
179+
return m_ranges.empty() || last().full();
180+
}
181+
182+
std::pair<osmid_t, std::size_t> get_internal(osmid_t id) const noexcept;
183+
184+
static constexpr std::size_t const max_block_size = 16 * 1024 * 1204;
185+
186+
std::vector<range_entry> m_ranges;
187+
std::size_t m_block_size;
188+
std::size_t m_size = 0;
189+
}; // class ordered_index_t
190+
191+
#endif // OSM2PSGQL_ORDERED_INDEX_HPP

src/osm2pgsql.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ static void run(options_t const &options)
3535
auto output =
3636
output_t::create_output(middle->get_query_instance(), options);
3737

38+
middle->set_requirements(output->get_requirements());
39+
3840
auto dependency_manager = std::unique_ptr<dependency_manager_t>(
3941
options.with_forward_dependencies
4042
? new full_dependency_manager_t{middle}

src/output-flex.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1402,6 +1402,13 @@ output_flex_t::output_flex_t(
14021402

14031403
if (!is_clone) {
14041404
init_lua(m_options.style);
1405+
1406+
// If the osm2pgsql.select_relation_members() Lua function is defined
1407+
// it means we need two-stage processing which in turn means we need
1408+
// the full ways stored in the middle.
1409+
if (m_select_relation_members) {
1410+
m_output_requirements.full_ways = true;
1411+
}
14051412
}
14061413

14071414
if (m_tables->empty()) {

src/output.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,15 @@ class output_t
7777

7878
virtual void merge_expire_trees(output_t *other);
7979

80+
struct output_requirements const &get_requirements() const noexcept
81+
{
82+
return m_output_requirements;
83+
}
84+
8085
protected:
8186
std::shared_ptr<middle_query_t> m_mid;
8287
const options_t m_options;
88+
output_requirements m_output_requirements{};
8389
};
8490

8591
#endif // OSM2PGSQL_OUTPUT_HPP

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ set_test(test-middle)
4848
set_test(test-options-database LABELS NoDB)
4949
set_test(test-options-parse LABELS NoDB)
5050
set_test(test-options-projection)
51+
set_test(test-ordered-index LABELS NoDB)
5152
set_test(test-output-gazetteer)
5253
set_test(test-output-pgsql)
5354
set_test(test-output-pgsql-area)

tests/common-import.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,8 @@ class import_t
148148
auto output =
149149
output_t::create_output(middle->get_query_instance(), options);
150150

151+
middle->set_requirements(output->get_requirements());
152+
151153
auto dependency_manager = std::unique_ptr<dependency_manager_t>(
152154
options.with_forward_dependencies
153155
? new full_dependency_manager_t{middle}
@@ -183,6 +185,8 @@ class import_t
183185
auto output =
184186
output_t::create_output(middle->get_query_instance(), options);
185187

188+
middle->set_requirements(output->get_requirements());
189+
186190
auto dependency_manager = std::unique_ptr<dependency_manager_t>(
187191
new full_dependency_manager_t{middle});
188192

0 commit comments

Comments
 (0)