Add new ordered_index_t class

joto · joto · commit f3ac57e2022f · 2021-04-26T12:01:13.000+02:00
This will be used in the new ram middle for efficiently storing
node locations, way node lists, tags etc.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -14,6 +14,7 @@ set(osm2pgsql_lib_SOURCES
   node-persistent-cache.cpp
   node-ram-cache.cpp
   options.cpp
+  ordered-index.cpp
   osmdata.cpp
   osmium-builder.cpp
   output-gazetteer.cpp
diff --git a/src/ordered-index.cpp b/src/ordered-index.cpp
@@ -0,0 +1,78 @@
+/**
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * This file is part of osm2pgsql (https://osm2pgsql.org/).
+ *
+ * Copyright (C) 2006-2021 by the osm2pgsql developer community.
+ * For a full list of authors see the git log.
+ */
+
+#include "ordered-index.hpp"
+
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <numeric>
+
+/**
+ * Sum up the reserved capacity of all second level blocks, ie. the number
+ * of entries that fit into the memory allocated so far.
+ */
+std::size_t ordered_index_t::capacity() const noexcept
+{
+    std::size_t total = 0;
+    for (auto const &block : m_ranges) {
+        total += block.index.capacity();
+    }
+    return total;
+}
+
+/**
+ * Append the entry (id -> offset) to the index.
+ *
+ * A new second level block is started when the current block is full or
+ * when the id or the offset can not be stored as a 32 bit delta relative to
+ * the first entry of the current block.
+ *
+ * \param id The key of the index.
+ * \param offset The value of the index.
+ *
+ * \pre Id and offset must be larger than any previously added id and
+ *      offset, respectively.
+ */
+void ordered_index_t::add(osmid_t id, std::size_t offset)
+{
+    assert(m_ranges.empty() ||
+           (last().to < id &&
+            (last().offset_from + last().index.back().offset) < offset));
+
+    // Second level entries only store 32 bit deltas. If either delta does
+    // not fit, the static_cast below would silently truncate it and corrupt
+    // the index, so a fresh block (with new base values) is needed. The
+    // last() calls are only evaluated if there is at least one block.
+    constexpr auto max_delta = std::numeric_limits<uint32_t>::max();
+    bool const new_block_needed =
+        need_new_2nd_level() || (id - last().from) > max_delta ||
+        (offset - last().offset_from) > max_delta;
+
+    if (new_block_needed) {
+        if (!m_ranges.empty()) {
+            // Extend the previous block up to the new one, so that ids
+            // between the two blocks resolve to the last entry of the
+            // previous block.
+            m_ranges.back().to = id - 1;
+        }
+        m_ranges.emplace_back(id, offset, m_block_size);
+        if (m_block_size < max_block_size) {
+            m_block_size <<= 1U;
+        }
+    }
+
+    // Yes, the first entry of every second level block is always {0, 0}.
+    // We leave it that way to simplify the code.
+    m_ranges.back().index.push_back(second_level_index_entry{
+        static_cast<uint32_t>(id - last().from),
+        static_cast<uint32_t>(offset - last().offset_from)});
+    m_ranges.back().to = id;
+    ++m_size;
+}
+
+/**
+ * Look up the entry with the largest id that is not larger than the
+ * specified id.
+ *
+ * \param id The id to look for.
+ * \returns Pair of the id actually found and its offset. If the index is
+ *          empty or the id is smaller than the first id in the index, the
+ *          pair {0, not_found_value()} is returned.
+ */
+std::pair<osmid_t, std::size_t> ordered_index_t::get_internal(osmid_t id) const
+    noexcept
+{
+    std::pair<osmid_t, std::size_t> const missing{0, not_found_value()};
+
+    if (m_ranges.empty()) {
+        return missing;
+    }
+
+    // First level: find the first block ending at or after the wanted id.
+    auto const block = std::lower_bound(
+        m_ranges.cbegin(), m_ranges.cend(), id,
+        [](range_entry const &candidate, osmid_t wanted) {
+            return candidate.to < wanted;
+        });
+
+    // The id is larger than everything in the index, so the very last
+    // entry is the next smaller one.
+    if (block == m_ranges.cend()) {
+        auto const &tail = last();
+        auto const &newest = tail.index.back();
+        return {tail.from + newest.id, tail.offset_from + newest.offset};
+    }
+
+    if (id < block->from) {
+        return missing;
+    }
+
+    // Second level: find the first entry behind the wanted id and step
+    // back by one. Ids in the block are stored relative to block->from.
+    auto const &entries = block->index;
+    auto const behind = std::upper_bound(
+        entries.cbegin(), entries.cend(), id - block->from,
+        [](std::size_t wanted, second_level_index_entry const &entry) {
+            return wanted < entry.id;
+        });
+    assert(behind != entries.cbegin());
+    auto const &hit = *(behind - 1);
+
+    return {block->from + hit.id, block->offset_from + hit.offset};
+}
diff --git a/src/ordered-index.hpp b/src/ordered-index.hpp
@@ -0,0 +1,191 @@
+#ifndef OSM2PSGQL_ORDERED_INDEX_HPP
+#define OSM2PSGQL_ORDERED_INDEX_HPP
+
+/**
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * This file is part of osm2pgsql (https://osm2pgsql.org/).
+ *
+ * Copyright (C) 2006-2021 by the osm2pgsql developer community.
+ * For a full list of authors see the git log.
+ */
+
+#include "osmtypes.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <utility>
+#include <vector>
+
+/**
+ * This class implements a memory-efficient ordered index for lookups from OSM
+ * ids to an "offset" into some kind of primary datastore. Adding to the index
+ * is amortized O(1), reading is O(log n) (a binary search on each level).
+ *
+ * Entries must always be added in order from lowest OSM id to highest OSM
+ * id and lowest offset to highest offset, ie. both id and offset for each
+ * entry must be strictly larger than the previous one. Entries can never
+ * be changed.
+ *
+ * An index that is never used doesn't need more memory than
+ * sizeof(ordered_index_t).
+ *
+ * All allocated memory can be freed by calling clear(). After that the index
+ * can NOT be reused.
+ *
+ * There are two ways of accessing the data through the index:
+ * * The get() method returns the offset for the specified id.
+ * * The get_block() method returns the offset for the next smaller id, if the
+ *   id itself is not found.
+ *
+ * The implementation is in two levels, the second level blocks contain the id
+ * and offset, the first level keeps track of second level blocks and the first
+ * and last ids used in each block. There are two reasons for the choice of
+ * this two-level design over a simpler vector-based design:
+ *
+ * * Vectors temporarily use a lot of memory when resizing. We can avoid this
+ *   by not resizing the second level blocks. We also save the memcpy needed
+ *   when resizing.
+ * * To conserve memory, the id and offset in the second level blocks are 32
+ *   bit unsigned integers relative to the id and offset of the first id of a
+ *   block which is stored in the first level entry. Compared to the 64 bit
+ *   integers we would need without the two-level design, this halves the
+ *   memory use.
+ */
+class ordered_index_t
+{
+public:
+    /**
+     * Constructor.
+     *
+     * \param initial_block_size Number of entries in the initial second level
+     *                           index block. Subsequent blocks will each
+     *                           double their size until max_block_size is
+     *                           reached.
+     */
+    explicit ordered_index_t(std::size_t initial_block_size = 1024 * 1024)
+    : m_block_size(initial_block_size)
+    {}
+
+    /**
+     * This is the value returned from the getter functions if the id is not
+     * in the database.
+     */
+    static constexpr std::size_t not_found_value() noexcept
+    {
+        return std::numeric_limits<std::size_t>::max();
+    }
+
+    /**
+     * How many entries will fit into the currently allocated memory. This
+     * is accurate for normal operations, but if there are huge gaps between
+     * consecutive ids (> 2^32), fewer entries than this will fit.
+     */
+    std::size_t capacity() const noexcept;
+
+    /// The number of entries in the index.
+    std::size_t size() const noexcept { return m_size; }
+
+    /**
+     * Add an entry to the index.
+     *
+     * \param id The key of the index.
+     * \param offset The value of the index.
+     *
+     * \pre Id and offset must be larger than any previously added id and
+     *      offset, respectively.
+     */
+    void add(osmid_t id, std::size_t offset);
+
+    /**
+     * Get the offset for the specified id.
+     *
+     * If the id is not in the index \code not_found_value() \endcode is
+     * returned.
+     *
+     * \param id The id to look for.
+     */
+    std::size_t get(osmid_t id) const noexcept
+    {
+        auto const p = get_internal(id);
+        // get_internal() returns the next smaller id if there is no exact
+        // match, only an exact match counts here.
+        if (p.first != id) {
+            return not_found_value();
+        }
+        return p.second;
+    }
+
+    /**
+     * Get the offset for the specified id or, if the id is not in the index,
+     * the next smaller id available in the index.
+     *
+     * If the id is not in the index and no smaller id is in the index,
+     * \code not_found_value() \endcode is returned.
+     *
+     * \param id The id to look for.
+     */
+    std::size_t get_block(osmid_t id) const noexcept
+    {
+        return get_internal(id).second;
+    }
+
+    /**
+     * The approximate amount of bytes currently allocated by this index.
+     */
+    std::size_t used_memory() const noexcept
+    {
+        return m_ranges.capacity() * sizeof(range_entry) +
+               capacity() * sizeof(second_level_index_entry);
+    }
+
+    /**
+     * Clear all memory used by this index. The index can NOT be reused after
+     * that.
+     */
+    void clear()
+    {
+        m_ranges.clear();
+        m_ranges.shrink_to_fit();
+        m_size = 0;
+    }
+
+private:
+    /**
+     * Entry in a second level block. Id and offset are stored relative to
+     * the "from" and "offset_from" values of the range_entry owning the
+     * block.
+     */
+    struct second_level_index_entry
+    {
+        uint32_t id;
+        uint32_t offset;
+    };
+
+    /**
+     * First level entry: Owns one second level block and keeps the base
+     * values for the relative ids and offsets stored in that block.
+     */
+    struct range_entry
+    {
+        std::vector<second_level_index_entry> index;
+        osmid_t from;            ///< Id of the first entry in this block.
+        osmid_t to = 0;          ///< Largest id covered by this block.
+        std::size_t offset_from; ///< Offset of the first entry in this block.
+
+        range_entry(osmid_t id, std::size_t offset, std::size_t block_size)
+        : from(id), offset_from(offset)
+        {
+            // Reserve the whole block up front so that it is never resized.
+            index.reserve(block_size);
+        }
+
+        /// True if adding another entry would exceed the reserved memory.
+        bool full() const noexcept { return index.size() == index.capacity(); }
+    };
+
+    /// The most recently created first level entry. \pre !m_ranges.empty()
+    range_entry const &last() const noexcept { return m_ranges.back(); }
+
+    /// True if there is no second level block yet or the last one is full.
+    bool need_new_2nd_level() const noexcept
+    {
+        return m_ranges.empty() || last().full();
+    }
+
+    /**
+     * Look up the specified id. Returns the id actually found (the id itself
+     * or the next smaller one) together with its offset.
+     */
+    std::pair<osmid_t, std::size_t> get_internal(osmid_t id) const noexcept;
+
+    /// Block sizes are not doubled any more once they reach this size.
+    static constexpr std::size_t const max_block_size = 16 * 1024 * 1024;
+
+    /// First level of the index, ordered by id.
+    std::vector<range_entry> m_ranges;
+
+    /// Number of entries reserved for the next second level block.
+    std::size_t m_block_size;
+
+    /// Number of entries added to the index.
+    std::size_t m_size = 0;
+}; // class ordered_index_t
+
+#endif // OSM2PSGQL_ORDERED_INDEX_HPP
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -48,6 +48,7 @@ set_test(test-middle)
 set_test(test-options-database LABELS NoDB)
 set_test(test-options-parse LABELS NoDB)
 set_test(test-options-projection)
+set_test(test-ordered-index LABELS NoDB)
 set_test(test-output-gazetteer)
 set_test(test-output-pgsql)
 set_test(test-output-pgsql-area)
diff --git a/tests/test-ordered-index.cpp b/tests/test-ordered-index.cpp