In [None]:
import pandas as pd
import numpy as np
import datetime as dt

import json

import requests
import os
import random
import time

In [None]:
import OSM_selenium_config

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure Selenium to reuse an existing Chrome user profile
# (i.e. one that is already logged into the OSM account).
options = Options()
for chrome_argument in (
    OSM_selenium_config.chrome_location,
    OSM_selenium_config.profile_location,
):
    options.add_argument(chrome_argument)

driver = webdriver.Chrome(options=options)

In [None]:
# Load the survey spreadsheet and keep a list of its original column labels.
data = pd.read_excel('data/OSM_survey_data.xlsx')
columns = [*data.columns.values]

# Relabel the seven columns 1..7; the values of column 1 become the
# contributor list used by the cells below.
data = data.set_axis(list(range(1, 8)), axis=1)
contributors = data.loc[:, 1].tolist()

In [None]:
# Request headers sent with every OSM API call below.
# `your_header` is appended to the GitHub URL used as the User-Agent
# (presumably your GitHub user or repository path -- fill it in before running).
your_header = ''
headers = {'User-Agent': 'https://github.com/' + your_header}

In [None]:
# Create the per-user download folder on first run; if it is already there,
# warn instead so existing downloads are checked before being overwritten.
# (The two cases are mutually exclusive: the warning must not fire right
# after the folder has just been created.)
if os.path.exists('OSM_users_updated'):
    print('"user" data folder already exists. Please check data before overwriting.')
else:
    os.mkdir('OSM_users_updated')

In [None]:
def get_user_id(user: str, timeout: float = 30.0):
    """Look up the numeric OSM user id that belongs to a display name.

    Queries the changesets endpoint filtered by ``display_name`` and reads the
    ``uid`` field of the first changeset in the response.

    Args:
        user: OSM display name. It is passed through ``params`` so that
            characters such as ``&``, ``#`` or ``+`` are URL-encoded instead
            of corrupting the query string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``uid`` value of the first changeset returned for the user.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        IndexError: If the response contains no changesets.
    """
    response = requests.get(
        'https://api.openstreetmap.org/api/0.6/changesets.json',
        params={'display_name': user},
        headers=headers,
        timeout=timeout)
    # Fail with the HTTP status rather than an opaque JSON decoding error.
    response.raise_for_status()
    return response.json()['changesets'][0]['uid']

In [None]:
def get_user_reg_date(user_id: int, timeout: float = 30.0):
    """Fetch the account creation timestamp of an OSM user.

    Args:
        user_id: Numeric OSM user id.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The ``account_created`` value from the user details response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.get(
        f'https://api.openstreetmap.org/api/0.6/user/{user_id}.json',
        headers=headers,
        timeout=timeout)
    # Fail with the HTTP status rather than an opaque JSON decoding error.
    response.raise_for_status()
    return response.json()['user']['account_created']

In [None]:
def get_user_changesets(user_id: int, timeout: float = 30.0):
    """Fetch the total number of changesets recorded for an OSM user.

    Args:
        user_id: Numeric OSM user id.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The ``changesets.count`` value from the user details response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.get(
        f'https://api.openstreetmap.org/api/0.6/user/{user_id}.json',
        headers=headers,
        timeout=timeout)
    # Fail with the HTTP status rather than an opaque JSON decoding error.
    response.raise_for_status()
    return response.json()['user']['changesets']['count']

In [None]:
def query_osm_api(contributors: list):
    """Download the changesets of every contributor from the OSM API.

    For each user the function:

    1. creates ``OSM_users_updated/<user>`` if it does not exist yet;
    2. looks up the user id, registration date and changeset count (users for
       which any of these lookups fails are reported and skipped);
    3. requests the changesets batch by batch, stepping backwards in time by
       passing the ``created_at`` of the last changeset of the previous batch
       as the upper bound of the ``time`` filter;
    4. writes every response to
       ``OSM_users_updated/<user>/<user>_<n>_<start>_<end>.json`` where ``n``
       counts down from the total number of batches, and ``start``/``end``
       are the dates of the last/first changeset in the batch.

    If the user's folder already contains files, the download resumes after
    the most recently written batch; if that batch cannot be located or read,
    the download restarts from the first batch.

    Args:
        contributors: OSM display names to process.

    Returns:
        None. Results are written to disk.
    """
    for user in contributors:

        print(f"Starting user '{user}' at {dt.datetime.now().strftime('%d/%m/%Y %H:%M:%S')}")

        # check if the user already has a directory, and if not create one
        user_dir = f'OSM_users_updated/{user}'
        if not os.path.exists(user_dir):
            os.mkdir(user_dir)

        # set up information for looping through user changesets
        # (``except Exception`` rather than a bare ``except`` so that a kernel
        # interrupt stops the run instead of being reported as a missing user)
        try:
            user_id = get_user_id(user)
            reg_date = get_user_reg_date(user_id)
            changesets = get_user_changesets(user_id)
        except Exception:
            print(f'User: {user} not found.')
            continue

        # calculate the total number of files for given user
        file_count = int(np.ceil(changesets/100))

        # check if any changesets for given user have already been downloaded
        already_downloaded = 0
        t_last = None
        try:
            already_downloaded = len(os.listdir(user_dir))
            if already_downloaded > 0:
                print(f"{already_downloaded} files exist for user '{user}'")
                # files are numbered downwards from file_count, so this is the
                # number of the batch that was written last
                resume_prefix = f'{user}_{((file_count - already_downloaded) + 1)}_'
                last_fn = [fn for fn in os.listdir(f'{user_dir}/')
                           if fn.startswith(resume_prefix)][0]
                with open(f'{user_dir}/{last_fn}') as f:
                    last_file = json.load(f)
                t_last = last_file['changesets'][-1]['created_at']
        except Exception:
            print(f"No files for user '{user}' exist. Starting download from scratch.")
            already_downloaded = 0
            t_last = None

        # upper time bound for the next request; only used from batch 2 onwards
        t0 = t_last

        # get changesets in groups of 100, skipping batches that are already on disk
        for i in range(already_downloaded + 1, file_count + 1):

            # the first batch is bounded by the registration date only; later
            # batches are additionally bounded by the oldest changeset seen so far
            time_filter = reg_date if i == 1 else f'{reg_date},{t0}'

            # ``params`` URL-encodes the display name and the timestamps
            response = requests.get(
                'https://api.openstreetmap.org/api/0.6/changesets.json',
                params={'display_name': user, 'time': time_filter},
                headers=headers)

            # parse the body once and reuse it below
            try:
                payload = response.json()
                t0 = payload['changesets'][-1]['created_at']
            except Exception:
                print('No more changesets - early break.')
                break

            try:
                start = t0[:10]
                end = payload['changesets'][0]['created_at'][:10]

                with open(f'{user_dir}/{user}_{((file_count+1)-i)}_{start}_{end}.json', 'w') as f:
                    json.dump(payload, f)

            except Exception:
                print('No more changesets.')
                break

            # pause between requests
            time.sleep(random.randint(1,3))

In [None]:
# Download the changesets of every entry in `contributors` into OSM_users_updated/.
query_osm_api(contributors)

In [None]:
def combine_jsons(contributors: list):
    """Merge each user's downloaded batch files into a single JSON file.

    For every user, the ``.json`` files in ``OSM_users_updated/<user>/`` are
    ordered by the batch number embedded in their name
    (``<user>_<n>_...json``), their ``changesets`` lists are concatenated in
    that order, and the result is written to
    ``OSM_users_full_updated/<user>.json``.

    Users without a download folder, or whose folder holds no ``.json``
    files, are skipped and get no output file.

    Args:
        contributors: OSM display names whose downloads should be merged.

    Returns:
        None. Results are written to disk.
    """
    os.makedirs('OSM_users_full_updated/', exist_ok=True)

    for user in contributors:
        user_dir = f'OSM_users_updated/{user}/'

        # a contributor that was never downloaded must not abort the whole run
        if not os.path.isdir(user_dir):
            print(f"No download folder for user '{user}'. Skipping.")
            continue

        filenames = [fn for fn in os.listdir(user_dir) if fn.endswith('.json')]
        if not filenames:
            continue

        # sort numerically on the batch number so that e.g. 10 comes after 2
        filenames.sort(key=lambda fn: int(fn.removeprefix(f'{user}').split('_')[1]))

        user_list = []
        for fn in filenames:
            with open(os.path.join(user_dir, fn), 'r') as file:
                user_list.extend(json.load(file)['changesets'])

        with open(f'OSM_users_full_updated/{user}.json', 'w') as out_file:
            json.dump(user_list, out_file)

In [None]:
# Merge the per-batch files of every entry in `contributors` into OSM_users_full_updated/.
combine_jsons(contributors)

In [None]:
# Scrape the created/modified/deleted counts per element type from each
# user's HDYC page. The statistics table is addressed by position: the three
# actions sit in columns 3-5 and the three element types in rows 2-4.
data2 = {}
for user in contributors:
    print(f'Starting user: {user}')
    user_key = f'{user}'
    data2[user_key] = {}

    driver.get(f'https://hdyc.neis-one.org/?{user}')

    try:
        for column_no, action in enumerate(['Created', 'Modified', 'Deleted'], 3):
            action_counts = data2[user_key][action] = {}
            for row_no, element_type in enumerate(['Nodes', 'Ways', 'Relations'], 2):
                cell_xpath = f'/html/body/div[3]/div[4]/table/tbody/tr[{row_no}]/td[{column_no}]'
                action_counts[element_type] = driver.find_element(By.XPATH, cell_xpath).text
    except NoSuchElementException:
        # no statistics table on the page: drop the partially filled entry
        print(f'\t User {user} not found. Deleting dictionary for {user}.')
        del data2[user_key]

In [None]:
# One small frame per scraped user (rows: actions, columns: element types),
# stacked under the user name as the outer index level.
users = list(data2)
dfs = [pd.DataFrame.from_dict(user_stats, orient='index') for user_stats in data2.values()]
df = pd.concat(dfs, keys=users)

# Each cell holds text; keep the part before the first space, drop the
# thousands separators and convert to int.
df = df.apply(
    lambda row: row.str.split(' ').str[0].str.replace(',', '').astype(int),
    axis=1,
)

In [None]:
# Switch to True to write the combined statistics to disk.
OUTPUT = False

if OUTPUT:
    df.to_csv('data/create-modify-delete_stats.csv')