Merge branch 'overpass-take-2' into develop

# Conflicts:
#	scripts/upload_animal_shelters.py
This commit is contained in:
2025-06-20 08:08:42 +02:00

View File

@@ -4,17 +4,25 @@ import os
from types import SimpleNamespace from types import SimpleNamespace
import requests import requests
# TODO: consider using OSMPythonTools instead of requests or overpass library
from osmtogeojson import osmtogeojson
from tqdm import tqdm from tqdm import tqdm
DEFAULT_OSM_DATA_FILE = "export.geojson" DEFAULT_OSM_DATA_FILE = "export.geojson"
# Search area must be the official name, e.g. "Germany" is not a valid area name in Overpass API
# Consider instead finding & using the code within the query itself, e.g. "ISO3166-1"="DE"
DEFAULT_OVERPASS_SEARCH_AREA = "Deutschland"
def parse_args(argv=None):
    """Parse command-line arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse falls back to ``sys.argv[1:]``, so existing callers that
            pass nothing keep their behaviour.

    Returns:
        An ``argparse.Namespace`` with ``api_token``, ``area``, ``instance``
        and ``data_file`` (each ``None`` when the flag is not given) and the
        boolean ``use_cached``.
    """
    parser = argparse.ArgumentParser(
        description="Download animal shelter data from the Overpass API to the Notfellchen API.")
    parser.add_argument("--api-token", type=str, help="API token for authentication.")
    parser.add_argument("--area", type=str, help="Area to search for animal shelters (default: Deutschland).")
    parser.add_argument("--instance", type=str, help="API instance URL.")
    parser.add_argument("--data-file", type=str, help="Path to the GeoJSON file containing (only) animal shelters.")
    parser.add_argument("--use-cached", action='store_true', help="Use the stored GeoJSON file")
    return parser.parse_args(argv)
@@ -23,16 +31,26 @@ def get_config():
args = parse_args() args = parse_args()
api_token = args.api_token or os.getenv("NOTFELLCHEN_API_TOKEN") api_token = args.api_token or os.getenv("NOTFELLCHEN_API_TOKEN")
# TODO: document new environment variable NOTFELLCHEN_AREA
area = args.area or os.getenv("NOTFELLCHEN_AREA", DEFAULT_OVERPASS_SEARCH_AREA)
instance = args.instance or os.getenv("NOTFELLCHEN_INSTANCE") instance = args.instance or os.getenv("NOTFELLCHEN_INSTANCE")
data_file = args.data_file or os.getenv("NOTFELLCHEN_DATA_FILE", DEFAULT_OSM_DATA_FILE) data_file = args.data_file or os.getenv("NOTFELLCHEN_DATA_FILE", DEFAULT_OSM_DATA_FILE)
use_cached = args.use_cached or os.getenv("NOTFELLCHEN_USE_CACHED", False)
if not api_token or not instance: if not api_token or not instance:
raise ValueError("API token and instance URL must be provided via environment variables or CLI arguments.") raise ValueError("API token and instance URL must be provided via environment variables or CLI arguments.")
return api_token, instance, data_file return api_token, area, instance, data_file, use_cached
def get_or_none(data, key):
    """Return ``data["properties"][key]``, or ``None`` when the key is absent.

    Args:
        data: A mapping with a ``"properties"`` mapping inside it
            (the callers in this file pass GeoJSON features).
        key: The property name to look up.
    """
    # A single .get() replaces the membership test plus a second lookup.
    return data["properties"].get(key)
def get_or_empty(data, key):
if key in data["properties"].keys(): if key in data["properties"].keys():
return data["properties"][key] return data["properties"][key]
else: else:
@@ -70,6 +88,27 @@ def https(value):
return None return None
def calc_coordinate_center(coordinates):
    """
    Calculates the center as the arithmetic mean of the list of coordinates.

    Not perfect because the earth is not flat, but good enough for the small
    areas this is used on.

    Args:
        coordinates: Iterable of positions whose first two elements are
            (longitude, latitude). Extra elements (e.g. an altitude) are
            ignored. ``None`` or an empty iterable is allowed.

    Returns:
        ``(longitude, latitude)`` of the mean position, or ``(None, None)``
        when there are no positions.
    """
    lon_sum = 0.0
    lat_sum = 0.0
    count = 0
    for position in coordinates or ():
        # Index instead of unpacking so positions with a third element
        # (altitude) do not raise "too many values to unpack".
        lon_sum += position[0]
        lat_sum += position[1]
        count += 1

    # Checked after the loop so exhausted iterators (which are truthy) do not
    # end in a division by zero.
    if count == 0:
        return None, None

    return lon_sum / count, lat_sum / count
def get_center_coordinates(geometry): def get_center_coordinates(geometry):
""" """
Given a GeoJSON geometry dict, return (longitude, latitude) Given a GeoJSON geometry dict, return (longitude, latitude)
@@ -93,25 +132,25 @@ def get_center_coordinates(geometry):
raise ValueError(f"Unsupported geometry type: {geom_type}") raise ValueError(f"Unsupported geometry type: {geom_type}")
# TODO: take note of new get_overpass_result function which does the bulk of the new overpass query work
def get_overpass_result(area, data_file):
    """Fetch animal shelters in the given area from the Overpass API.

    Builds the Overpass query, sends it to the public endpoint, converts the
    OSM JSON response to GeoJSON and writes that GeoJSON to ``data_file`` so
    it can be reused later.

    Args:
        area: Value of the ``name`` tag of the Overpass search area.
        data_file: Path the converted GeoJSON is written to (overwritten).

    Returns:
        The converted GeoJSON object, or ``None`` when the endpoint answers
        with anything other than HTTP 200.

    Raises:
        Whatever ``requests.get`` raises on connection problems or when the
        client-side timeout expires; these are not caught here.
    """
    overpass_endpoint = "https://overpass-api.de/api/interpreter"
    # Escape backslashes and double quotes so an area name containing them
    # cannot terminate the quoted string and alter the query.
    escaped_area = area.replace("\\", "\\\\").replace('"', '\\"')
    overpass_query = f"""
    [out:json][timeout:25];
    area[name="{escaped_area}"]->.searchArea;
    nwr["amenity"="animal_shelter"](area.searchArea);
    out body;
    >;
    out skel qt;
    """
    # The server-side limit is 25 s (see query); the client-side timeout is
    # larger so a stalled connection cannot hang the script forever.
    r = requests.get(overpass_endpoint, params={'data': overpass_query}, timeout=60)
    if r.status_code != 200:
        # The caller checks for None and reports the failure.
        return None

    result = osmtogeojson.process_osm_json(r.json())
    with open(data_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False)
    return result
def add_if_available(base_data, keys, result): def add_if_available(base_data, keys, result):
@@ -122,23 +161,51 @@ def add_if_available(base_data, keys, result):
return result return result
def create_location(tierheim, instance, headers):
    """Create a location for an animal shelter via the API and return it.

    Args:
        tierheim: GeoJSON feature of the shelter. Its ``properties`` must
            contain ``name`` and ``addr:city``; the other address parts are
            optional and filled via ``get_or_empty``.
        instance: Base URL of the API instance (no trailing slash expected,
            since ``/api/locations/`` is appended).
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON body of the created location.

    Raises:
        SystemExit: With exit status 1 when the API does not answer with
            HTTP 201.
    """
    properties = tierheim["properties"]
    # Compute the center once instead of once per coordinate.
    center = get_center_coordinates(tierheim["geometry"])
    location_data = {
        "place_id": tierheim["id"],
        "longitude": center[0],
        "latitude": center[1],
        "name": properties["name"],
        "city": properties["addr:city"],
        "housenumber": get_or_empty(tierheim, "addr:housenumber"),
        "postcode": get_or_empty(tierheim, "addr:postcode"),
        "street": get_or_empty(tierheim, "addr:street"),
        "countrycode": get_or_empty(tierheim, "addr:country"),
    }

    location_result = requests.post(f"{instance}/api/locations/", json=location_data, headers=headers)

    if location_result.status_code != 201:
        print(
            f"Location for {properties['name']}:{location_result.status_code} {location_result.json()} not created")
        # Non-zero status so shells and CI see the failure; a bare exit()
        # would report success.
        raise SystemExit(1)
    return location_result.json()
def main(): def main():
api_token, instance, data_file = get_config() api_token, area, instance, data_file, use_cached = get_config()
if not use_cached:
# Query shelters
overpass_result = get_overpass_result(area, data_file)
if overpass_result is None:
print("Error: get_overpass_result returned None")
return
print(f"Response type: {type(overpass_result)}")
print(f"Response content: {overpass_result}")
else:
with open(data_file, 'r', encoding='utf-8') as f:
overpass_result = json.load(f)
# Set headers and endpoint # Set headers and endpoint
endpoint = f"{instance}/api/organizations/" endpoint = f"{instance}/api/organizations/"
h = {'Authorization': f'Token {api_token}', "content-type": "application/json"} h = {'Authorization': f'Token {api_token}', "content-type": "application/json"}
with open(data_file, encoding="utf8") as f: tierheime = overpass_result["features"]
d = json.load(f)
skipped_low_quality = 0 for idx, tierheim in enumerate(tqdm(tierheime)):
tierheime = d["features"]
for idx, tierheim in enumerate(tierheime):
# Check if data is low quality # Check if data is low quality
if "name" not in tierheim["properties"].keys() or "addr:city" not in tierheim["properties"].keys(): if "name" not in tierheim["properties"].keys() or "addr:city" not in tierheim["properties"].keys():
skipped_low_quality = skipped_low_quality + 1
continue continue
# Load TH data in for easier accessing # Load TH data in for easier accessing
@@ -147,8 +214,8 @@ def main():
email=choose(("contact:email", "email"), tierheim["properties"]), email=choose(("contact:email", "email"), tierheim["properties"]),
phone_number=choose(("contact:phone", "phone"), tierheim["properties"], replace=True), phone_number=choose(("contact:phone", "phone"), tierheim["properties"], replace=True),
fediverse_profile=get_or_none(tierheim, "contact:mastodon"), fediverse_profile=get_or_none(tierheim, "contact:mastodon"),
facebook=https(add(get_or_none(tierheim, "contact:facebook"), "facebook")), facebook=https(add(get_or_empty(tierheim, "contact:facebook"), "facebook")),
instagram=https(add(get_or_none(tierheim, "contact:instagram"), "instagram")), instagram=https(add(get_or_empty(tierheim, "contact:instagram"), "instagram")),
website=https(choose(("contact:website", "website"), tierheim["properties"])), website=https(choose(("contact:website", "website"), tierheim["properties"])),
description=get_or_none(tierheim, "opening_hours"), description=get_or_none(tierheim, "opening_hours"),
external_object_identifier=tierheim["id"], external_object_identifier=tierheim["id"],
@@ -168,51 +235,31 @@ def main():
print(f"{th_data.name} already exists as ID {org_id}.") print(f"{th_data.name} already exists as ID {org_id}.")
org_patch_data = {"id": org_id, org_patch_data = {"id": org_id,
"name": th_data.name} "name": th_data.name}
if search_result.json()[0]["location"] is None:
location = create_location(tierheim, instance, h)
org_patch_data["location"] = location["id"]
add_if_available(th_data, optional_data, org_patch_data) org_patch_data = add_if_available(th_data, optional_data, org_patch_data)
result = requests.patch(endpoint, json=org_patch_data, headers=h) result = requests.patch(endpoint, json=org_patch_data, headers=h)
if result.status_code != 200: if result.status_code != 200:
print(f"Updating {tierheim['properties']['name']} failed:{result.status_code} {result.json()}") print(f"Updating {tierheim['properties']['name']} failed:{result.status_code} {result.json()}")
exit() exit()
continue continue
else:
location_data = { location = create_location(tierheim, instance, h)
"place_id": tierheim["id"],
"longitude": get_center_coordinates(tierheim["geometry"])[0],
"latitude": get_center_coordinates(tierheim["geometry"])[1],
"name": tierheim["properties"]["name"],
"city": tierheim["properties"]["addr:city"],
"housenumber": get_or_none(tierheim, "addr:housenumber"),
"postcode": get_or_none(tierheim, "addr:postcode"),
"street": get_or_none(tierheim, "addr:street"),
"countrycode": get_or_none(tierheim, "addr:country"),
}
location_result = requests.post(f"{instance}/api/locations/", json=location_data, headers=h)
if location_result.status_code != 201:
print(
f"{idx} Location for {tierheim["properties"]["name"]}:{location_result.status_code} {location_result.json()} not created")
exit()
org_data = {"name": tierheim["properties"]["name"], org_data = {"name": tierheim["properties"]["name"],
"location": location_result.json()["id"],
"external_object_identifier": f"{tierheim["id"]}", "external_object_identifier": f"{tierheim["id"]}",
"external_source_identifier": "OSM" "external_source_identifier": "OSM",
"location": location["id"]
} }
add_if_available(th_data, optional_data, org_data) org_data = add_if_available(th_data, optional_data, org_data)
result = requests.post(endpoint, json=org_data, headers=h) result = requests.post(endpoint, json=org_data, headers=h)
if result.status_code != 201: if result.status_code != 201:
print(f"{idx} {tierheim["properties"]["name"]} failed:{result.status_code} {result.json()}") print(f"{idx} {tierheim["properties"]["name"]}:{result.status_code} {result.json()}")
exit(1)
else:
print(f"{idx} - {json.loads(result.content)["id"]} {tierheim["properties"]["name"]} created")
print(f"{skipped_low_quality} datapoints skipped for low quality ({100*skipped_low_quality / len(tierheime):.2}%)")
if __name__ == "__main__": if __name__ == "__main__":