DEFAULT_OSM_DATA_FILE = "export.geojson"
# The search area must be the name the Overpass API knows the area by,
# e.g. "Germany" is not a valid area name there.
# Consider matching on a code inside the query instead, e.g. "ISO3166-1"="DE".
DEFAULT_OVERPASS_SEARCH_AREA = "Deutschland"

# Spellings of NOTFELLCHEN_USE_CACHED that switch cached mode on.
_TRUE_VALUES = frozenset({"1", "true", "yes", "on"})


def parse_args(argv=None):
    """Parse command-line arguments.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        An ``argparse.Namespace`` with ``api_token``, ``area``, ``instance``
        and ``data_file`` (each ``None`` when the option was not given) and
        the boolean ``use_cached``.
    """
    parser = argparse.ArgumentParser(
        description="Download animal shelter data from the Overpass API to the Notfellchen API.")
    parser.add_argument("--api-token", type=str, help="API token for authentication.")
    parser.add_argument("--area", type=str, help="Area to search for animal shelters (default: Deutschland).")
    parser.add_argument("--instance", type=str, help="API instance URL.")
    parser.add_argument("--data-file", type=str, help="Path to the GeoJSON file containing (only) animal shelters.")
    parser.add_argument("--use-cached", action='store_true', help="Use the stored GeoJSON file")
    return parser.parse_args(argv)


def get_config(argv=None, env=None):
    """Collect the script configuration from CLI arguments and the environment.

    A CLI argument always wins over its environment variable:

    * ``--api-token``  / ``NOTFELLCHEN_API_TOKEN`` (required)
    * ``--instance``   / ``NOTFELLCHEN_INSTANCE`` (required)
    * ``--area``       / ``NOTFELLCHEN_AREA`` (default: ``DEFAULT_OVERPASS_SEARCH_AREA``)
    * ``--data-file``  / ``NOTFELLCHEN_DATA_FILE`` (default: ``DEFAULT_OSM_DATA_FILE``)
    * ``--use-cached`` / ``NOTFELLCHEN_USE_CACHED`` (``1``, ``true``, ``yes`` or ``on``)

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` uses ``sys.argv``.
        env: Mapping to read environment variables from; ``None`` uses
            ``os.environ``.

    Returns:
        Tuple ``(api_token, area, instance, data_file, use_cached)`` where
        ``use_cached`` is always a ``bool``.

    Raises:
        ValueError: If the API token or the instance URL is missing.
    """
    args = parse_args(argv)
    env = os.environ if env is None else env

    api_token = args.api_token or env.get("NOTFELLCHEN_API_TOKEN")
    instance = args.instance or env.get("NOTFELLCHEN_INSTANCE")
    # TODO: document the NOTFELLCHEN_AREA environment variable in the user docs
    # An empty variable counts as unset, so the defaults still apply.
    area = args.area or env.get("NOTFELLCHEN_AREA") or DEFAULT_OVERPASS_SEARCH_AREA
    data_file = args.data_file or env.get("NOTFELLCHEN_DATA_FILE") or DEFAULT_OSM_DATA_FILE

    # Environment values are strings: "0" or "false" must not enable cached
    # mode just because the string is non-empty.
    env_use_cached = env.get("NOTFELLCHEN_USE_CACHED", "").strip().lower() in _TRUE_VALUES
    use_cached = bool(args.use_cached) or env_use_cached

    if not api_token or not instance:
        raise ValueError("API token and instance URL must be provided via environment variables or CLI arguments.")

    return api_token, area, instance, data_file, use_cached


def get_or_none(data, key):
    """Return ``data["properties"][key]``, or ``None`` when the key is absent."""
    return data["properties"].get(key)


def calc_coordinate_center(coordinates):
    """
    Calculates the center as the arithmetic mean of the list of coordinates.

    Not perfect because earth is a sphere (citation needed) but good enough.

    Args:
        coordinates: Iterable of positions whose first two items are
            longitude and latitude. Further items (e.g. an altitude) are
            ignored.

    Returns:
        Tuple ``(longitude, latitude)`` of floats, or ``(None, None)`` when
        there are no coordinates.
    """
    # Materialise once so iterators work too and the emptiness check is reliable.
    points = [(position[0], position[1]) for position in coordinates or ()]
    if not points:
        return None, None

    count = len(points)
    lon_center = sum((lon for lon, _ in points), 0.0) / count
    lat_center = sum((lat for _, lat in points), 0.0) / count
    return lon_center, lat_center
def build_overpass_query(area):
    """Return the Overpass QL query selecting animal shelters inside *area*.

    Args:
        area: Value of the ``name`` tag of the area to search in.

    Returns:
        The query as a string, asking for JSON output of every node, way and
        relation tagged ``amenity=animal_shelter`` in that area.
    """
    # Escape backslashes first, then quotes, so the area name cannot end the
    # quoted string and inject additional query statements.
    escaped_area = area.replace("\\", "\\\\").replace('"', '\\"')
    return "\n".join((
        "[out:json][timeout:25];",
        f'area[name="{escaped_area}"]->.searchArea;',
        'nwr["amenity"="animal_shelter"](area.searchArea);',
        "out body;",
        ">;",
        "out skel qt;",
    ))


def get_overpass_result(area, data_file):
    """Fetch animal shelters in *area* from the Overpass API and cache them.

    The Overpass response is converted to GeoJSON and written to *data_file*.

    Args:
        area: Name of the area to search in (see ``build_overpass_query``).
        data_file: Path the GeoJSON result is written to.

    Returns:
        The GeoJSON result produced by ``osmtogeojson``, or ``None`` when the
        request failed or did not answer with HTTP 200.
    """
    overpass_endpoint = "https://overpass-api.de/api/interpreter"
    try:
        response = requests.get(
            overpass_endpoint,
            params={'data': build_overpass_query(area)},
            # The query itself asks for a 25 s server-side limit; wait a bit
            # longer than that instead of hanging forever.
            timeout=30,
        )
    except requests.RequestException as err:
        print(f"Overpass request failed: {err}")
        return None

    if response.status_code != 200:
        print(f"Overpass request failed: {response.status_code} {response.text[:200]}")
        return None

    result = osmtogeojson.process_osm_json(response.json())
    with open(data_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False)
    return result


def create_location(tierheim, instance, headers):
    """Create a location for one shelter via the Notfellchen API.

    Args:
        tierheim: GeoJSON feature of the shelter. Its properties must contain
            ``name`` and ``addr:city``.
        instance: Base URL of the Notfellchen instance.
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        SystemExit: With exit status 1 when the API does not answer with
            HTTP 201.
    """
    properties = tierheim["properties"]
    name = properties["name"]
    # Compute the center once instead of once per coordinate.
    center = get_center_coordinates(tierheim["geometry"])
    location_data = {
        "place_id": tierheim["id"],
        "longitude": center[0],
        "latitude": center[1],
        "name": name,
        "city": properties["addr:city"],
        "housenumber": get_or_empty(tierheim, "addr:housenumber"),
        "postcode": get_or_empty(tierheim, "addr:postcode"),
        "street": get_or_empty(tierheim, "addr:street"),
        "countrycode": get_or_empty(tierheim, "addr:country"),
    }

    location_result = requests.post(
        f"{instance}/api/locations/",
        json=location_data,
        headers=headers,
        timeout=30,
    )

    if location_result.status_code != 201:
        # Print the raw body: an error response is not guaranteed to be JSON.
        print(f"Location for {name}:{location_result.status_code} {location_result.text} not created")
        # Exit with a non-zero status so callers and shells see the failure.
        raise SystemExit(1)
    return location_result.json()
data_file = get_config() + api_token, area, instance, data_file, use_cached = get_config() + if not use_cached: + # Query shelters + overpass_result = get_overpass_result(area, data_file) + if overpass_result is None: + print("Error: get_overpass_result returned None") + return + print(f"Response type: {type(overpass_result)}") + print(f"Response content: {overpass_result}") + else: + with open(data_file, 'r', encoding='utf-8') as f: + overpass_result = json.load(f) + # Set headers and endpoint endpoint = f"{instance}/api/organizations/" h = {'Authorization': f'Token {api_token}', "content-type": "application/json"} - with open(data_file, encoding="utf8") as f: - d = json.load(f) + tierheime = overpass_result["features"] - skipped_low_quality = 0 - - tierheime = d["features"] - - for idx, tierheim in enumerate(tierheime): + for idx, tierheim in enumerate(tqdm(tierheime)): # Check if data is low quality if "name" not in tierheim["properties"].keys() or "addr:city" not in tierheim["properties"].keys(): - skipped_low_quality = skipped_low_quality + 1 continue # Load TH data in for easier accessing @@ -147,8 +214,8 @@ def main(): email=choose(("contact:email", "email"), tierheim["properties"]), phone_number=choose(("contact:phone", "phone"), tierheim["properties"], replace=True), fediverse_profile=get_or_none(tierheim, "contact:mastodon"), - facebook=https(add(get_or_none(tierheim, "contact:facebook"), "facebook")), - instagram=https(add(get_or_none(tierheim, "contact:instagram"), "instagram")), + facebook=https(add(get_or_empty(tierheim, "contact:facebook"), "facebook")), + instagram=https(add(get_or_empty(tierheim, "contact:instagram"), "instagram")), website=https(choose(("contact:website", "website"), tierheim["properties"])), description=get_or_none(tierheim, "opening_hours"), external_object_identifier=tierheim["id"], @@ -168,51 +235,31 @@ def main(): print(f"{th_data.name} already exists as ID {org_id}.") org_patch_data = {"id": org_id, "name": th_data.name} + 
if search_result.json()[0]["location"] is None: + location = create_location(tierheim, instance, h) + org_patch_data["location"] = location["id"] - add_if_available(th_data, optional_data, org_patch_data) + org_patch_data = add_if_available(th_data, optional_data, org_patch_data) result = requests.patch(endpoint, json=org_patch_data, headers=h) if result.status_code != 200: print(f"Updating {tierheim['properties']['name']} failed:{result.status_code} {result.json()}") exit() continue - - location_data = { - "place_id": tierheim["id"], - "longitude": get_center_coordinates(tierheim["geometry"])[0], - "latitude": get_center_coordinates(tierheim["geometry"])[1], - "name": tierheim["properties"]["name"], - "city": tierheim["properties"]["addr:city"], - "housenumber": get_or_none(tierheim, "addr:housenumber"), - "postcode": get_or_none(tierheim, "addr:postcode"), - "street": get_or_none(tierheim, "addr:street"), - "countrycode": get_or_none(tierheim, "addr:country"), - } - - location_result = requests.post(f"{instance}/api/locations/", json=location_data, headers=h) - - if location_result.status_code != 201: - print( - f"{idx} Location for {tierheim["properties"]["name"]}:{location_result.status_code} {location_result.json()} not created") - exit() - - org_data = {"name": tierheim["properties"]["name"], - "location": location_result.json()["id"], - "external_object_identifier": f"{tierheim["id"]}", - "external_source_identifier": "OSM" - } - - add_if_available(th_data, optional_data, org_data) - - result = requests.post(endpoint, json=org_data, headers=h) - - if result.status_code != 201: - print(f"{idx} {tierheim["properties"]["name"]} failed:{result.status_code} {result.json()}") - exit(1) else: - print(f"{idx} - {json.loads(result.content)["id"]} {tierheim["properties"]["name"]} created") + location = create_location(tierheim, instance, h) + org_data = {"name": tierheim["properties"]["name"], + "external_object_identifier": f"{tierheim["id"]}", + 
"external_source_identifier": "OSM", + "location": location["id"] + } - print(f"{skipped_low_quality} datapoints skipped for low quality ({100*skipped_low_quality / len(tierheime):.2}%)") + org_data = add_if_available(th_data, optional_data, org_data) + + result = requests.post(endpoint, json=org_data, headers=h) + + if result.status_code != 201: + print(f"{idx} {tierheim["properties"]["name"]}:{result.status_code} {result.json()}") if __name__ == "__main__":