Sam Afshari's Notes

Extracting OSM POIs - Sun, Jun 1, 2025

The aim of these notes is to take the OSM planet PBF file, process it, and create an indexed SQLite database of all the POIs and tags.

Along the way I will also show how to split the PBF file into chunks for parallel processing, using a RAM disk on WSL2 Ubuntu to speed up I/O. .osm and .osm.bz2 files are produced for each chunk, and the chunks are then merged.

All of the pre-processing is done on WSL2 Ubuntu, on a machine with 128 GB of RAM and 36 CPU cores.

First, the WSL2 virtual machine has to be configured to use more memory than its default allocation. To do this, create a file named .wslconfig in the Windows user's home directory containing:

[wsl2]
memory=110GB
swap=110GB

Save the file and restart the VM by running wsl --shutdown. You can then use htop to verify how much memory and swap WSL has after this change.
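
For a quick check, something like this works (run wsl --shutdown from a Windows terminal, the rest inside Ubuntu):

wsl --shutdown   # restart WSL so .wslconfig is re-read
free -h          # inside Ubuntu: confirm the new memory and swap limits
nproc            # number of CPU cores visible to WSL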

Download the latest OSM planet PBF file (e.g. planet-250526.osm.pbf) and copy it into your working directory. In this example, the working directory is /mnt/f/osm_vm_data and the RAM disk is mounted at /mnt/ramdisk/.
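
For reference, the planet file can be downloaded from planet.openstreetmap.org (the dated filename changes with each weekly build, so adjust accordingly):

cd /mnt/f/osm_vm_data
# planet-latest.osm.pbf always points at the newest weekly build (roughly 80 GB)
wget https://planet.openstreetmap.org/pbf/planet-latest.osm.pbf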

osmium-tool and GNU parallel are needed for the processing scripts; osmosis is optional here but useful for other OSM workflows:

sudo apt install -y parallel
sudo apt install -y osmium-tool
sudo apt install -y osmosis

Next, we run a script that (optionally) creates a RAM disk, splits the planet file into longitude-band chunks, and filters each chunk down to amenity nodes. Adjust the paths and RAM disk usage as desired:

#!/bin/bash
# Run on Ubuntu with osmium-tool, parallel, and bzip2 installed
set -x
# Configuration
INPUT_PBF="/mnt/ramdisk/planet-250526.osm.pbf"
CHUNKS=33  # Number of parallel chunks (adjust based on CPU cores)
RAMDISK_SIZE="110G"  # Adjust based on available RAM

# Check dependencies
command -v osmium >/dev/null 2>&1 || { echo "osmium-tool not installed. Run: sudo apt install osmium-tool"; exit 1; }
command -v parallel >/dev/null 2>&1 || { echo "parallel not installed. Run: sudo apt install parallel"; exit 1; }
command -v bzip2 >/dev/null 2>&1 || { echo "bzip2 not installed. Run: sudo apt install bzip2"; exit 1; }

# Create RAM disk for faster I/O (uncomment if /mnt/ramdisk is not already mounted)
#sudo mkdir -p /mnt/ramdisk
#sudo mount -t tmpfs -o size="$RAMDISK_SIZE" tmpfs /mnt/ramdisk

# Copy input file to RAM disk
cp "/mnt/f/osm_vm_data/planet-250526.osm.pbf" /mnt/ramdisk/

# Split the planet into longitude-band chunks
mkdir -p /mnt/f/osm_vm_data/chunks
seq 0 $((CHUNKS-1)) | parallel -j8 'min_lon=$(({}*360/'$CHUNKS'-180)); max_lon=$((({}+1)*360/'$CHUNKS'-180)); osmium extract -b "$min_lon,-90,$max_lon,90" "'$INPUT_PBF'" -o "/mnt/f/osm_vm_data/chunks/chunk{}.pbf"'

# Process chunks in parallel to extract all amenity nodes
ls /mnt/f/osm_vm_data/chunks/chunk*.pbf | parallel -j"$CHUNKS" osmium tags-filter {} n/amenity -o {.}.osm

# Merge results
osmium merge /mnt/f/osm_vm_data/chunks/chunk*.osm -o /mnt/ramdisk/amenities.osm

# Compress for Overpass
bzip2 /mnt/ramdisk/amenities.osm

The script can be adjusted as needed to produce per-chunk or whole-planet PBF, OSM, and bz2 files containing only the POIs.
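
If per-chunk .osm.bz2 files are also wanted (as mentioned at the start), a small sketch using the same chunk directory as above:

# Keep a bzip2-compressed copy of each per-chunk .osm file (-k keeps the originals)
ls /mnt/f/osm_vm_data/chunks/chunk*.osm | parallel -j8 'bzip2 -k {}'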

More POI types can be extracted by adding more tag keys to the filter expression. For example:

osmium tags-filter /mnt/ramdisk/planet-latest.osm.pbf n/amenity,shop,tourism,leisure,office,craft,emergency,highway=bus_stop -o /mnt/ramdisk/pois.pbf 

Build JSONs from OSM data

In this section, I will build a newline-separated text file (JSON Lines), where each line is a separate JSON object containing the data of one POI.

#include <osmium/io/any_input.hpp>
#include <osmium/handler.hpp>
#include <osmium/visitor.hpp>
#include <osmium/osm/node.hpp>
#include <osmium/thread/pool.hpp>
#include "nlohmann/json.hpp"
#include <fstream>
#include <vector>
#include <thread>
#include <mutex>
#include <queue>
#include <condition_variable>
#include <iostream>
#include <string>
#include <bzlib.h>
#include <stdexcept>

// Thread-safe queue for JSON lines
class ThreadSafeQueue {
    std::queue<std::string> queue_;
    std::mutex mutex_;
    std::condition_variable cond_;
    bool done_ = false;

public:
    void push(std::string item) {
        std::lock_guard<std::mutex> lock(mutex_);
        queue_.push(std::move(item));
        cond_.notify_one();
    }

    bool pop(std::string& item) {
        std::unique_lock<std::mutex> lock(mutex_);
        cond_.wait(lock, [this] { return !queue_.empty() || done_; });
        if (queue_.empty() && done_) return false;
        item = std::move(queue_.front());
        queue_.pop();
        return true;
    }

    void set_done() {
        std::lock_guard<std::mutex> lock(mutex_);
        done_ = true;
        cond_.notify_all();
    }
};

// Osmium handler to process nodes
struct AmenityHandler : public osmium::handler::Handler {
    ThreadSafeQueue& output_queue_;
    explicit AmenityHandler(ThreadSafeQueue& queue) : output_queue_(queue) {}

    void node(const osmium::Node& node) {
        if (node.tags().has_key("amenity")) {
            nlohmann::json j;
            j["id"] = node.id();
            j["lat"] = node.location().lat();
            j["lon"] = node.location().lon();
            j["tags"] = nlohmann::json::object();
            for (const auto& tag : node.tags()) {
                j["tags"][tag.key()] = tag.value();
            }
            output_queue_.push(j.dump());
        }
    }
};

// Writer thread function
void writer_thread(ThreadSafeQueue& queue, const std::string& output_file) {
    std::ofstream out(output_file, std::ios::out | std::ios::binary);
    if (!out.is_open()) {
        throw std::runtime_error("Cannot open output file: " + output_file);
    }
    std::string line;
    while (queue.pop(line)) {
        out << line << '\n';
    }
    out.close();
}

int main(int argc, char* argv[]) {
    if (argc != 3) {
        std::cerr << "Usage: " << argv[0] << " <input_file> <output_file>\n";
        return 1;
    }

    std::string input_file = argv[1];
    std::string output_file = argv[2];

    try {
        // Initialize thread-safe queue
        ThreadSafeQueue queue;

        // Start writer thread
        std::thread writer(writer_thread, std::ref(queue), output_file);

        // Set up Osmium reader
        osmium::io::File file(input_file);
        osmium::io::Reader reader(file, osmium::osm_entity_bits::node);

        // Set up the handler; the libosmium Reader already decodes input blocks
        // on background threads, so no explicit thread pool is needed here
        AmenityHandler handler(queue);

        // Apply handler to OSM data
        osmium::apply(reader, handler);

        // Close reader
        reader.close();

        // Signal writer to finish
        queue.set_done();
        writer.join();
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << '\n';
        return 1;
    }

    std::cout << "Output written to " << output_file << '\n';
    return 0;
}

To build this, we need libosmium, nlohmann/json and libbz2:

  • https://github.com/osmcode/libosmium
  • https://github.com/nlohmann/json

sudo apt update
sudo apt install libosmium2-dev libbz2-dev zlib1g-dev libexpat1-dev nlohmann-json3-dev
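
A build command along these lines should work, assuming the nlohmann/json header is reachable on the include path and libexpat is present for libosmium's XML reader (adjust flags to your setup):

g++ -std=c++17 -O2 -o osm_to_jsonl osm_to_jsonl.cpp -lbz2 -lz -lexpat -lpthread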

Usage:

./osm_to_jsonl amenities.osm.bz2 amenities.jsonl
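
Each output line is a single JSON object with id, lat, lon, and a tags map; a quick way to sanity-check the result:

# Show the first record; expect something of the form
# {"id": ..., "lat": ..., "lon": ..., "tags": {"amenity": "...", "name": "...", ...}}
head -n 1 amenities.jsonl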

JSON to SQLite

We’ll use a C# program to build an indexed SQLite database from the JSON Lines file.

using System.Collections.Concurrent;
using System.Data.SQLite;
using System.Text;

using Newtonsoft.Json.Linq;

class Program
{
    private class PoiData
    {
        public long Id { get; set; }
        public double Lat { get; set; }
        public double Lon { get; set; }
        public string Name { get; set; }
        public string Wikidata { get; set; }
        public JObject Tags { get; set; }
        public string RawJson { get; set; }
        public int LatInt { get; set; }
        public int LonInt { get; set; }
    }

    private class SQLiteConnectionWrapper
    {
        private readonly SQLiteConnection _connection;
        private readonly object _lock = new object();

        public SQLiteConnectionWrapper(string dbPath)
        {
            _connection = new SQLiteConnection($"Data Source={dbPath};Version=3;");
            _connection.Open();
        }

        public void ExecuteNonQuery(string sql, params SQLiteParameter[] parameters)
        {
            lock (_lock)
            {
                using (var cmd = new SQLiteCommand(sql, _connection))
                {
                    if (parameters != null) cmd.Parameters.AddRange(parameters);
                    cmd.ExecuteNonQuery();
                }
            }
        }

        public long ExecuteScalar(string sql, params SQLiteParameter[] parameters)
        {
            lock (_lock)
            {
                using (var cmd = new SQLiteCommand(sql, _connection))
                {
                    if (parameters != null) cmd.Parameters.AddRange(parameters);
                    return Convert.ToInt64(cmd.ExecuteScalar());
                }
            }
        }

        public void Vacuum()
        {
            ExecuteNonQuery("VACUUM");
        }

        public void Close()
        {
            lock (_lock)
            {
                _connection.Close();
            }
        }
    }

    static async Task Main(string[] args)
    {
        string inputFile = args.Length > 0 ? args[0] : "r:/amenities.jsonl";
        string dbFile = args.Length > 1 ? args[1] : "r:/amenities.db";
        const int batchSize = 100000; // 8 bound parameters per row; lower this if SQLite reports "too many SQL variables"

        try
        {
            // Initialize SQLite database
            if (File.Exists(dbFile)) File.Delete(dbFile);
            var db = new SQLiteConnectionWrapper(dbFile);

            // Optimize SQLite settings
            db.ExecuteNonQuery("PRAGMA cache_size = -2000000;"); // 2GB cache
            db.ExecuteNonQuery("PRAGMA temp_store_directory = 'r:/';");

            // Create tables
            db.ExecuteNonQuery(@"
                CREATE TABLE pois (
                    id INTEGER PRIMARY KEY,
                    lat REAL NOT NULL,
                    lon REAL NOT NULL,
                    name TEXT,
                    lat_int INTEGER NOT NULL,
                    lon_int INTEGER NOT NULL,
                    wikidata TEXT,
                    raw_json TEXT
                );

                CREATE TABLE tags (
                    tag_id INTEGER PRIMARY KEY AUTOINCREMENT,
                    key TEXT NOT NULL,
                    value TEXT NOT NULL,
                    UNIQUE (key, value)
                );

                CREATE TABLE poi_tags (
                    poi_id INTEGER NOT NULL,
                    tag_id INTEGER NOT NULL,
                    PRIMARY KEY (poi_id, tag_id),
                    FOREIGN KEY (poi_id) REFERENCES pois(id),
                    FOREIGN KEY (tag_id) REFERENCES tags(tag_id)
                );
            ");

            // Deduplicate tags
            var tagCache = new ConcurrentDictionary<(string key, string value), long>();

            // Process JSON Lines in parallel with partitioning
            await Task.Run(() =>
            {
                var lines = File.ReadLines(inputFile); // lazy; LongCount() and each Skip/Take below re-read the file, which is acceptable on a RAM drive
                Parallel.ForEach(Partitioner.Create(0, lines.LongCount(), batchSize), range =>
                {
                    var batch = lines.Skip((int)range.Item1).Take((int)(range.Item2 - range.Item1)).Select(line =>
                    {
                        try
                        {
                            var json = JObject.Parse(line);
                            return new PoiData
                            {
                                Id = json["id"].Value<long>(),
                                Lat = json["lat"].Value<double>(),
                                Lon = json["lon"].Value<double>(),
                                Tags = json["tags"].Value<JObject>(),
                                RawJson = line,
                                LatInt = (int)(json["lat"].Value<double>() * 100000),
                                LonInt = (int)(json["lon"].Value<double>() * 100000),
                                Name = json["tags"]?["name"]?.Value<string>(),
                                Wikidata = json["tags"]?["wikidata"]?.Value<string>()
                            };
                        }
                        catch (Exception ex)
                        {
                            Console.WriteLine($"Error parsing line: {ex.Message}");
                            return null;
                        }
                    }).Where(poi => poi != null).ToList();

                    if (batch.Count > 0)
                    {
                        ProcessBatch(db, batch, tagCache);
                    }

                    Console.WriteLine($"Processed batch from {range.Item1} to {range.Item2} with {batch.Count} POIs.");
                });
            });

            Console.WriteLine("Building indexes...");

            // Create indexes in a transaction
            db.ExecuteNonQuery("BEGIN TRANSACTION;");
            db.ExecuteNonQuery(@"
                CREATE INDEX idx_pois_name ON pois(name);
                CREATE INDEX idx_pois_lat_int ON pois(lat_int);
                CREATE INDEX idx_pois_lon_int ON pois(lon_int);
                CREATE INDEX idx_pois_wikidata ON pois(wikidata);
                CREATE INDEX idx_tags_key ON tags(key);
                CREATE INDEX idx_tags_value ON tags(value);
                CREATE INDEX idx_poi_tags_poi_id ON poi_tags(poi_id);
                CREATE INDEX idx_poi_tags_tag_id ON poi_tags(tag_id);
            ");
            db.ExecuteNonQuery("COMMIT;");

            Console.WriteLine("Indexes created.");

            // Compact the database
            db.Vacuum();
            db.Close();

            Console.WriteLine($"Database created at {dbFile}");
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }

    static void ProcessBatch(SQLiteConnectionWrapper db, List<PoiData> batch, ConcurrentDictionary<(string key, string value), long> tagCache)
    {
        // Batch INSERT for pois
        var poiSql = new StringBuilder("INSERT INTO pois (id, lat, lon, name, lat_int, lon_int, wikidata, raw_json) VALUES ");
        var poiParams = new List<SQLiteParameter>();
        for (int i = 0; i < batch.Count; i++)
        {
            var poi = batch[i];
            poiSql.Append($"(@id{i}, @lat{i}, @lon{i}, @name{i}, @lat_int{i}, @lon_int{i}, @wikidata{i}, @raw_json{i})");
            if (i < batch.Count - 1) poiSql.Append(",");
            poiParams.AddRange(new[]
            {
                new SQLiteParameter($"@id{i}", poi.Id),
                new SQLiteParameter($"@lat{i}", poi.Lat),
                new SQLiteParameter($"@lon{i}", poi.Lon),
                new SQLiteParameter($"@name{i}", poi.Name ?? (object)DBNull.Value),
                new SQLiteParameter($"@lat_int{i}", poi.LatInt),
                new SQLiteParameter($"@lon_int{i}", poi.LonInt),
                new SQLiteParameter($"@wikidata{i}", poi.Wikidata ?? (object)DBNull.Value),
                new SQLiteParameter($"@raw_json{i}", poi.RawJson)
            });
        }

        if (batch.Count > 0)
        {
            db.ExecuteNonQuery(poiSql.ToString(), poiParams.ToArray());
        }

        // Batch INSERT for tags
        var tagSql = new StringBuilder("INSERT OR IGNORE INTO tags (key, value) VALUES ");
        var tagParams = new List<SQLiteParameter>();
        var uniqueTags = new HashSet<(string key, string value)>();
        int tagIndex = 0;

        foreach (var poi in batch)
        {
            foreach (var tag in poi.Tags)
            {
                var keyValue = (key: tag.Key, value: tag.Value.ToString());
                if (uniqueTags.Add(keyValue))
                {
                    tagSql.Append($"(@key{tagIndex}, @value{tagIndex})");
                    tagSql.Append(",");
                    tagParams.AddRange(new[]
                    {
                        new SQLiteParameter($"@key{tagIndex}", keyValue.key),
                        new SQLiteParameter($"@value{tagIndex}", keyValue.value)
                    });
                    tagIndex++;
                }
            }
        }

        if (uniqueTags.Count > 0)
        {
            tagSql.Length--; // Remove trailing comma
            db.ExecuteNonQuery(tagSql.ToString(), tagParams.ToArray());

            // Update tagCache
            foreach (var kv in uniqueTags)
            {
                tagCache.GetOrAdd(kv, _ =>
                {
                    return db.ExecuteScalar(
                        "SELECT tag_id FROM tags WHERE key = @key AND value = @value",
                        new SQLiteParameter("@key", kv.key),
                        new SQLiteParameter("@value", kv.value)
                    );
                });
            }
        }

        // Batch INSERT for poi_tags
        var poiTagSql = new StringBuilder("INSERT OR IGNORE INTO poi_tags (poi_id, tag_id) VALUES ");
        var poiTagParams = new List<SQLiteParameter>();
        int poiTagIndex = 0;

        foreach (var poi in batch)
        {
            foreach (var tag in poi.Tags)
            {
                var keyValue = (key: tag.Key, value: tag.Value.ToString());
                if (tagCache.TryGetValue(keyValue, out var tagId))
                {
                    poiTagSql.Append($"(@poi_id{poiTagIndex}, @tag_id{poiTagIndex})");
                    poiTagSql.Append(",");
                    poiTagParams.AddRange(new[]
                    {
                        new SQLiteParameter($"@poi_id{poiTagIndex}", poi.Id),
                        new SQLiteParameter($"@tag_id{poiTagIndex}", tagId)
                    });
                    poiTagIndex++;
                }
            }
        }

        if (poiTagIndex > 0)
        {
            poiTagSql.Length--; // Remove trailing comma
            db.ExecuteNonQuery(poiTagSql.ToString(), poiTagParams.ToArray());
        }
    }
}
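
Once the database is built, the lat_int/lon_int columns (degrees multiplied by 100000) make bounding-box queries index-friendly. A minimal sketch with the sqlite3 CLI; the coordinates and tag values below are purely illustrative:

sqlite3 amenities.db "
SELECT p.id, p.name, p.lat, p.lon
FROM pois p
JOIN poi_tags pt ON pt.poi_id = p.id
JOIN tags t ON t.tag_id = pt.tag_id
WHERE t.key = 'amenity' AND t.value = 'restaurant'
  AND p.lat_int BETWEEN 5931000 AND 5934000   -- latitude 59.31 to 59.34
  AND p.lon_int BETWEEN 1804000 AND 1808000;  -- longitude 18.04 to 18.08
"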

On Windows, the free ImDisk utility can be used to create a RAM drive (the r:/ paths above), which noticeably speeds up this step.
