Note: The borders of Tartu changed with the 2017 municipality reform, so some areas that were previously outside Tartu (e.g. 'Haage küla', 'Ilmatsalu küla') are counted as part of Tartu after 2017.
import contextily as cx
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from adjustText import adjust_text
from sklearn.cluster import DBSCAN
import seaborn as sns
df = pd.read_csv('data/stops_gps.csv', delimiter=';')
df2 = pd.read_csv('data/raw_gps.csv', delimiter=';')
df.shape
(1035, 6)
# Convert timestamps from str to Timestamp class
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])
df2['timestamp'] = pd.to_datetime(df2['time_system_ts'])
# Convert latitude, longitude and radius from str to float
df['lng'] = df['lng'].str.replace(',', '.').astype(float)
df['lat'] = df['lat'].str.replace(',', '.').astype(float)
df['radius'] = df['radius'].str.replace(',', '.').astype(float)
df2['lng'] = df2['x'].str.replace(',', '.').astype(float)
df2['lat'] = df2['y'].str.replace(',', '.').astype(float)
Check for errors:¶
# Start time should be before end time
df[df['start_time'] >= df['end_time']] # Rows with said error (result: 0)
| start_time | end_time | duration_millis | lng | lat | radius |
|---|---|---|---|---|---|
# end_time - start_time should equal duration_millis
print(
'Rows with wrong duration:',
len(df[df['duration_millis'] != (df['end_time'] - df['start_time']).dt.total_seconds() * 1000])
)
Rows with wrong duration: 780
# Fix duration
df['duration_millis'] = (df['end_time'] - df['start_time']).dt.total_seconds() * 1000
# Check for missing values
df.isnull().any(axis = 1).sum()
0
# Radius over 70 (total: 9)
df[df['radius'] > 70]
| | start_time | end_time | duration_millis | lng | lat | radius |
|---|---|---|---|---|---|---|
| 15 | 2014-07-01 11:34:35 | 2014-07-01 11:41:44 | 429000.0 | 25.965064 | 58.363173 | 73.522475 |
| 52 | 2014-07-31 19:42:00 | 2014-08-01 13:48:11 | 65171000.0 | 26.716240 | 58.375423 | 92.589390 |
| 72 | 2014-08-31 01:50:24 | 2014-08-31 02:24:12 | 2028000.0 | 26.717806 | 58.376259 | 72.659606 |
| 150 | 2014-07-11 23:49:22 | 2014-07-12 01:39:03 | 6581000.0 | 26.721638 | 58.381380 | 77.130602 |
| 414 | 2015-07-26 13:47:30 | 2015-07-26 14:03:37 | 967000.0 | 2.299585 | 48.857739 | 74.427139 |
| 597 | 2015-08-03 13:38:26 | 2015-08-03 13:45:36 | 430000.0 | 26.722594 | 58.374064 | 71.034769 |
| 658 | 2015-08-18 10:39:21 | 2015-08-18 11:01:16 | 1315000.0 | 26.720888 | 58.380411 | 71.856091 |
| 694 | 2015-07-24 18:36:04 | 2015-07-24 18:44:13 | 489000.0 | 2.347044 | 48.853571 | 73.398368 |
| 881 | 2015-08-01 17:39:26 | 2015-08-01 18:26:20 | 2814000.0 | 26.608733 | 58.378765 | 78.392248 |
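These low-accuracy stops are not removed here; a minimal sketch (the 70 m threshold and the `low_accuracy` column name are illustrative assumptions) for flagging them so later steps could filter on them if needed:
# Flag stops whose reported GPS radius exceeds 70 m (threshold is an assumption)
df['low_accuracy'] = df['radius'] > 70
print(df['low_accuracy'].sum(), 'stops flagged as low accuracy')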
Add municipalities¶
df['point'] = df.apply(lambda row: Point(row['lng'], row['lat']), axis=1)
df2['point'] = df2.apply(lambda row: Point(row.lng, row.lat), axis=1)
gdf_points = gpd.GeoDataFrame(df, geometry='point', crs='EPSG:4326')
gdf2_points = gpd.GeoDataFrame(df2, geometry='point', crs='EPSG:4326')
municipalities = gpd.read_file('data/municipalities.geojson')
municipalities = municipalities.to_crs("EPSG:4326")
gdf_with_municipalities = gpd.sjoin(gdf_points, municipalities, how="left", predicate="within")
df_mun = gdf_with_municipalities[[
'start_time',
'end_time',
'duration_millis',
'lng',
'lat',
'radius',
'ONIMI',
'point']
].rename(columns={'ONIMI': 'municipality'})
gdf2_with_municipalities = gpd.sjoin(gdf2_points, municipalities, how="left", predicate="within")
df2_mun = gdf2_with_municipalities[[
'timestamp',
'accuracy',
'speed',
'altitude',
'lng',
'lat',
'ONIMI',
'point']
].rename(columns={'ONIMI': 'municipality'})
print(
'Number of rows where municipality was not found: ',
len(df_mun[df_mun['municipality'].isnull()])
)
Number of rows where municipality was not found: 74
print('Stops outside of Estonia:')
world = gpd.read_file('data/world.geojson')
fig, ax = plt.subplots(figsize=(15, 10))
world.plot(ax=ax, color='lightgrey', edgecolor='black')
df_mun.plot(ax=ax, color='red', markersize=5)
xmin, ymin, xmax, ymax = df_mun.total_bounds
ax.set_xlim(xmin - 1, xmax + 1)
ax.set_ylim(ymin - 1, ymax + 1)
plt.show()
Stops outside of Estonia:
Prepare data analysis¶
df_mun['start_year'] = df_mun['start_time'].dt.year
df_mun['end_year'] = df_mun['end_time'].dt.year
df_mun['start_month'] = df_mun['start_time'].dt.month
df_mun['end_month'] = df_mun['end_time'].dt.month
df_mun['start_day'] = df_mun['start_time'].dt.day
df_mun['start_day_of_week'] = df_mun['start_time'].dt.dayofweek
df_mun['start_week'] = df_mun['start_time'].dt.isocalendar().week
df_mun['end_day'] = df_mun['end_time'].dt.day
df_mun['start_hour'] = df_mun['start_time'].dt.hour
df_mun['end_hour'] = df_mun['end_time'].dt.hour
# Using metric system for distance calculations
df_mun_metric = df_mun.to_crs(epsg=3857)
df2_mun['year'] = df2_mun['timestamp'].dt.year
df2_mun['month'] = df2_mun['timestamp'].dt.month
df2_mun['day'] = df2_mun['timestamp'].dt.day
df2_mun['hour'] = df2_mun['timestamp'].dt.hour
df2_mun['time'] = df2_mun['timestamp'].dt.time
# Get Tartu municipality
tartu = municipalities[municipalities['ONIMI'] == 'Tartu linn']
Detect meaningful places: home, work and other meaningful locations from the data! Describe the methodology and output, visualise it (e.g. map)!¶
Using the stops_gps.csv data.
Using DBSCAN (with the 'auto' algorithm) we create clusters that group the most visited locations.
Using the haversine metric we find neighboring points within a 25 m radius (approximately the accuracy radius of the GPS).
To establish each location, we then take the center point (mean) of the cluster's points.
We got 8 clusters:
- The most frequent stop is at the Pallase art school, above which there is also an apartment building; this is most likely the person's home.
- Pepleri street 3, the primary workplace.
- Kraavihall OÜ, Vaarika tn 13-1, the second most likely workplace.
- Ülikooli tn 17-310, which might be a University of Tartu institute or the Werner café; given the few visits, we assume the café.
- Crepp café.
- A guest house in Elva.
- A summer cottage.
- A visit to Paris.
| Cluster | Place | Stops | Coordinates (lat, lng) |
|---|---|---|---|
| 5 | Pallase kunstikool (home) | 465 | 58.37451318620583, 26.722370884148493 |
| 2 | Rektangel OÜ, Pepleri tn 3 | 86 | 58.37574530745641, 26.71774397444003 |
| 0 | Kraavihall OÜ, Vaarika tn 13-1 | 32 | 58.352871786421495, 26.693132724172344 |
| 7 | Ülikooli tn 17-310 (work?) | 21 | 58.38131036234592, 26.720802410790206 |
| 4 | Crepp / Trepp | 15 | 58.38197287767127, 26.721411302633566 |
| 3 | Külalistemaja Vehendi Motell, Elva | 14 | 58.22425014827104, 26.127454574200243 |
| 1 | Järvekalda tee 1/1, Viljandimaa (recreation center) | 13 | 58.36297057665973, 25.96458674992763 |
| 6 | Paris | 16 | 48.85431059779363, 2.307358561105023 |
coords = df_mun[['lng', 'lat']].dropna().to_numpy()
# Convert degrees to radians for haversine
coords_rad = np.radians(coords)
# DBSCAN clustering with Haversine metric
kms_per_radian = 6371.0088
epsilon = 0.025 / kms_per_radian  # Neighboring radius of 25 meters
db = DBSCAN(
    eps=epsilon,
    min_samples=10,  # Cluster must have at least n nearby points
algorithm='auto',
metric='haversine'
).fit(coords_rad)
# Assign cluster labels to the dataframe
df_mun['location_cluster'] = db.labels_
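With the labels in place, a minimal sketch (illustrative, using the columns created above) for tallying each cluster's size and mean center point, similar to the summary listed earlier:
# Summarise each DBSCAN cluster: number of stops and mean (lat, lng) center.
# Label -1 marks DBSCAN noise points and is excluded.
cluster_summary = (
    df_mun[df_mun['location_cluster'] != -1]
    .groupby('location_cluster')
    .agg(stops=('lat', 'size'), lat=('lat', 'mean'), lng=('lng', 'mean'))
    .sort_values('stops', ascending=False)
)
print(cluster_summary)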
HOME = 5
WORK = 2
WORK2 = 0
WERNER = 7
CREPP = 4
GUEST_HOUSE = 3
SUMMER_HOUSE = 1
PARIS = 6
cluster_labels = [(5, 'Home'), (2, 'Work'), (0, 'Work'), (7, 'Werner'), (4, 'Crepp'), (3, 'Guest house'), (1, 'Summer house')]
# Collect cluster center points and labels
centers = []
labels = []
for index, label in cluster_labels:
cluster_data = df_mun[df_mun['location_cluster'] == index]
lng = cluster_data['lng'].mean()
lat = cluster_data['lat'].mean()
centers.append(Point(lng, lat))
labels.append(label)
# Create GeoDataFrame
locations = gpd.GeoDataFrame({'label': labels}, geometry=centers, crs='EPSG:4326')
locations['coords'] = locations['geometry'].apply(lambda l: f"{l.y},{l.x}")
from adjustText import adjust_text
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(15, 10))
tartu.plot(ax=ax, color='lightgray', edgecolor='black')
locations[:-2].plot(ax=ax, color='red', markersize=10)
texts = []
for idx, row in locations[:-2].iterrows():
texts.append(
ax.text(
row.geometry.x, row.geometry.y, row['label'],
fontsize=14, ha='left', va='bottom', color='red',
bbox=dict(facecolor='white', alpha=1, edgecolor='none', pad=1.5)
)
)
adjust_text(
texts, ax=ax,
expand_text=(1.3, 1.8),
expand_points=(1.2, 1.9),
force_text=1,
force_points=1,
arrowprops=dict(arrowstyle='->', color='black', lw=1)
)
plt.show()
Looks like you are using a tranform that doesn't support FancyArrowPatch, using ax.annotate instead. The arrows might strike through texts. Increasing shrinkA in arrowprops might help.
from adjustText import adjust_text
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(15, 10))
municipalities.plot(ax=ax, color='lightgray', edgecolor='black')
locations.plot(ax=ax, color='red', markersize=10)
texts = []
for idx, row in locations.iterrows():
texts.append(
ax.text(
row.geometry.x, row.geometry.y, row['label'],
fontsize=14, ha='left', va='bottom', color='red',
bbox=dict(facecolor='white', alpha=1, edgecolor='none', pad=1.5)
)
)
adjust_text(
texts, ax=ax,
expand_text=(1.3, 1.8),
expand_points=(1.2, 1.9),
force_text=1,
force_points=1,
arrowprops=dict(arrowstyle='->', color='black', lw=1)
)
plt.show()
Describe the temporal patterns of visiting the city centre of Tartu!¶
#https://www.openstreetmap.org/relation/4572218#map=15/58.37774/26.72394
tartu_center = gpd.read_file('data/tartu_center.geojson')
print('Tartu center')
fig, ax = plt.subplots(figsize=(15, 10))
tartu_center.geometry.plot(ax=ax, color='lightgrey', edgecolor='black')
plt.show()
Tartu center
print('Stops in center per week/hours')
heatmap_data = df_mun.groupby(['start_day_of_week', 'start_hour']).size().unstack(fill_value=0)
plt.figure(figsize=(14, 6))
sns.heatmap(heatmap_data, cmap='YlOrRd', linewidths=0.5, annot=True, fmt='d')
plt.xlabel('Hour of Day')
plt.ylabel('Day of Week')
plt.yticks(ticks=[0.5 + i for i in range(7)], labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=0)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
Stops in center per week/hours
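The heatmap above aggregates every stop by weekday and hour; a minimal sketch for restricting the same aggregation to stops that fall inside the city-centre polygon (assuming tartu_center is in, or can be re-projected to, EPSG:4326):
# Keep only stops whose point lies within the dissolved city-centre boundary
center_poly = tartu_center.to_crs('EPSG:4326').geometry.unary_union
center_stops = df_mun[df_mun['point'].within(center_poly)]
center_heatmap = (
    center_stops.groupby(['start_day_of_week', 'start_hour'])
    .size()
    .unstack(fill_value=0)
    .reindex(index=range(7), columns=range(24), fill_value=0)
)
sns.heatmap(center_heatmap, cmap='YlOrRd', linewidths=0.5)
plt.show()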
How to describe the mobility (e.g. daily/weekly/monthly/seasonal mileage; transport mode [walking, cycling, public transportation, private car, etc])?¶
df_mun_sorted = df_mun.sort_values(by='start_time').copy()
df_mun_sorted['travel_time'] = df_mun_sorted['start_time'] - df_mun_sorted['end_time'].shift()
df_mun_sorted_3301 = df_mun_sorted.to_crs(epsg=3301)
df_mun_sorted_3301['prev_point'] = df_mun_sorted_3301['point'].shift()
df_mun_sorted_3301['travel_distance_m'] = df_mun_sorted_3301.distance(df_mun_sorted_3301['prev_point'])
df_mun_sorted_3301['travel_distance_km'] = df_mun_sorted_3301['travel_distance_m'] / 1000
df_mun_sorted_3301['travel_km_h'] = (
df_mun_sorted_3301['travel_distance_km'] / (df_mun_sorted['travel_time'].dt.total_seconds() / 3600))
conditions = [
df_mun_sorted_3301['travel_km_h'] <= 5, # Walking
(df_mun_sorted_3301['travel_km_h'] > 5) & (df_mun_sorted_3301['travel_km_h'] <= 15), # Cycling
(df_mun_sorted_3301['travel_km_h'] > 15) & (df_mun_sorted_3301['travel_km_h'] <= 40), # Public Transport
(df_mun_sorted_3301['travel_km_h'] > 40) & (df_mun_sorted_3301['travel_km_h'] <= 200), # Private Car
df_mun_sorted_3301['travel_km_h'] > 200 # Plane
]
choices = ['walking', 'cycling', 'public_transport', 'private_car', 'plane']
df_mun_sorted_3301['transport_form'] = np.select(conditions, choices, default='unknown')
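Before the day-of-week, weekly and monthly aggregations below, a minimal sketch for per-calendar-date mileage (the temporary date column is introduced only for this illustration):
# Total travelled distance per calendar date, as a quick summary of daily mileage
daily_km = (
    df_mun_sorted_3301
    .assign(date=df_mun_sorted_3301['start_time'].dt.date)
    .groupby('date')['travel_distance_km']
    .sum()
)
print(daily_km.describe())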
print('Transport Form Counts by Day')
grouped = (df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
.groupby(['start_day_of_week', 'transport_form'])
.size()
.unstack(fill_value=0)
)
ax = grouped.plot(kind='bar', figsize=(10, 6))
ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=0)
for container in ax.containers:
ax.bar_label(container, label_type='edge', fontsize=8)
ax.set_xlabel('Day of week')
ax.set_ylabel('Count')
ax.legend(title='Transport Form')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Transport Form Counts by Day
print('Transport Form Counts by Week')
grouped = (df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
.groupby(['start_week', 'transport_form'])
.size()
.unstack(fill_value=0)
)
ax = grouped.plot(kind='bar', figsize=(10, 6))
ax.set_xticklabels([27, 28, 29, 30, 31, 32, 33, 34, 35, 36], rotation=0)
for container in ax.containers:
ax.bar_label(container, label_type='edge', fontsize=8)
ax.set_xlabel('Week')
ax.set_ylabel('Count')
ax.legend(title='Transport Form')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Transport Form Counts by Week
print('Transport Form Counts by Month')
grouped = (df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
.groupby(['start_month', 'transport_form'])
.size()
.unstack(fill_value=0)
)
ax = grouped.plot(kind='bar', figsize=(10, 6))
ax.set_xticklabels(['July', 'August'], rotation=0)
for container in ax.containers:
ax.bar_label(container, label_type='edge', fontsize=8)
ax.set_xlabel('Month')
ax.set_ylabel('Count')
ax.legend(title='Transport Form')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Transport Form Counts by Month
print('Transport Form Counts by Year')
grouped = (df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
.groupby(['start_year', 'transport_form'])
.size()
.unstack(fill_value=0)
)
ax = grouped.plot(kind='bar', figsize=(10, 6))
ax.set_xticklabels(['2014', '2015'], rotation=0)
for container in ax.containers:
ax.bar_label(container, label_type='edge', fontsize=8)
ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.legend(title='Transport Form')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Transport Form Counts by Year
print('Mileage by Day')
ax = df_mun_sorted_3301.groupby(['start_day_of_week'])['travel_distance_km'].sum().plot(
kind='bar',
figsize=(10, 6)
)
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Mileage (km)')
ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=0)
plt.tight_layout()
plt.show()
Mileage by Day
print('Mileage by Day')
ax = (
df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
.groupby(['start_day_of_week', 'transport_form'])['travel_distance_km']
.sum()
.unstack(fill_value=0)
).plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Mileage (km)')
ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=0)
plt.tight_layout()
plt.show()
Mileage by Day
print('Mileage by Day (in %)')
daily_counts = (
df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
.groupby(['start_day_of_week', 'transport_form'])['travel_distance_km']
.sum()
.unstack(fill_value=0)
)
daily_percentages = daily_counts.div(daily_counts.sum(axis=1), axis=0) * 100
daily_percentages = daily_percentages.sort_index()
ax = daily_percentages.plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Share of Trips (%)')
ax.set_xticks(range(7))
ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=0)
plt.tight_layout()
plt.show()
Mileage by Day (in %)
print('Mileage by Week')
ax = df_mun_sorted_3301.groupby(['start_week'])['travel_distance_km'].sum().plot(
kind='bar',
figsize=(10, 6)
)
ax.set_xlabel('Week')
ax.set_ylabel('Mileage (km)')
ax.set_xticklabels([27, 28, 29, 30, 31, 32, 33, 34, 35, 36], rotation=0)
plt.tight_layout()
plt.show()
Mileage by Week
print('Mileage by Week')
ax = (
df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
.groupby(['start_week', 'transport_form'])['travel_distance_km']
.sum()
.unstack(fill_value=0)
).plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_xlabel('Week')
ax.set_ylabel('Mileage (km)')
ax.set_xticklabels([27, 28, 29, 30, 31, 32, 33, 34, 35, 36], rotation=0)
plt.tight_layout()
plt.show()
Mileage by Week
print('Mileage by Week (in %)')
weekly_counts = (
df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
.groupby(['start_week', 'transport_form'])['travel_distance_km']
.sum()
.unstack(fill_value=0)
)
weekly_percentages = weekly_counts.div(weekly_counts.sum(axis=1), axis=0) * 100
ax = weekly_percentages.plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_xlabel('Week')
ax.set_ylabel('Share of Trips (%)')
ax.set_xticks(range(len(weekly_percentages)))
ax.set_xticklabels(weekly_percentages.index, rotation=0)
plt.legend(title='Transport Form', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
Mileage by Week (in %)
print('Mileage by Month')
ax = df_mun_sorted_3301.groupby(['start_month'])['travel_distance_km'].sum().plot(
kind='bar',
figsize=(10, 6)
)
ax.set_xlabel('Month')
ax.set_ylabel('Mileage (km)')
ax.set_xticklabels(['July', 'August'], rotation=0)
plt.tight_layout()
plt.show()
Mileage by Month
print('Mileage by Month')
ax = (
df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
.groupby(['start_month', 'transport_form'])['travel_distance_km']
.sum()
.unstack(fill_value=0)
).plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_xlabel('Month')
ax.set_ylabel('Mileage (km)')
ax.set_xticklabels(['July', 'August'], rotation=0)
plt.tight_layout()
plt.show()
Mileage by Month
print('Mileage by Month (in %)')
monthly_counts = (
df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
.groupby(['start_month', 'transport_form'])['travel_distance_km']
.sum()
.unstack(fill_value=0)
)
monthly_percentages = monthly_counts.div(monthly_counts.sum(axis=1), axis=0) * 100
ax = monthly_percentages.plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_xlabel('Month')
ax.set_ylabel('Share of Trips (%)')
ax.set_xticks(range(len(monthly_percentages)))
ax.set_xticklabels(monthly_percentages.index, rotation=0)
plt.legend(title='Transport Form', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
Mileage by Month (in %)
print('Mileage by Year')
ax = df_mun_sorted_3301.groupby(['start_year'])['travel_distance_km'].sum().plot(
kind='bar',
figsize=(10, 6)
)
ax.set_xlabel('Year')
ax.set_ylabel('Mileage (km)')
ax.set_xticklabels(['2014', '2015'], rotation=0)
plt.tight_layout()
plt.show()
Mileage by Year
print('Mileage by Year')
ax = (
df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
.groupby(['start_year', 'transport_form'])['travel_distance_km']
.sum()
.unstack(fill_value=0)
).plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_xlabel('Year')
ax.set_ylabel('Mileage (km)')
ax.set_xticklabels(['2014', '2015'], rotation=0)
plt.tight_layout()
plt.show()
Mileage by Year
print('Mileage by Year (in %)')
yearly_counts = (
df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
.groupby(['start_year', 'transport_form'])['travel_distance_km']
.sum()
.unstack(fill_value=0)
)
yearly_percentages = yearly_counts.div(yearly_counts.sum(axis=1), axis=0) * 100
ax = yearly_percentages.plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_xlabel('Year')
ax.set_ylabel('Share of Trips (%)')
ax.set_xticks(range(len(yearly_percentages)))
ax.set_xticklabels(yearly_percentages.index, rotation=0)
plt.legend(title='Transport Form', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
Mileage by Year (in %)
How has mobility changed between 2014 and 2015?¶
How often is the respondent out of Tartu? Can you detect regularities?¶
df_mun['in_tartu'] = df_mun['municipality'] == 'Tartu linn'
df2_mun['in_tartu'] = df2_mun['municipality'] == 'Tartu linn'
print('Points in Estonia and inside of Tartu')
fig, ax = plt.subplots(figsize=(15, 10))
municipalities.plot(ax=ax, color='lightgrey', edgecolor='black')
tartu.plot(ax=ax, color='green')
df_mun[df_mun['in_tartu'] == True].plot(ax=ax, color='red', markersize=5)
plt.show()
Points in Estonia and inside of Tartu
print('Points in Estonia and outside of Tartu')
fig, ax = plt.subplots(figsize=(15, 10))
municipalities.plot(ax=ax, color='lightgrey', edgecolor='black')
tartu.plot(ax=ax, color='green')
df_mun[(df_mun['in_tartu'] != True) & (df_mun['municipality'].isnull() == False)].plot(ax=ax, color='red', markersize=5)
plt.show()
Points in Estonia and outside of Tartu
print(
'Stops that have different start and end year:',
len(df_mun[df_mun['start_year'] != df_mun['end_year']])
)
Stops that have different start and end year: 0
print('Stops:')
print(f"Stops in Tartu (2014): {len(df_mun[(df_mun['in_tartu'] == True) & (df_mun['start_year'] == 2014)])}")
print(f"Stops outside Tartu (2014): {len(df_mun[(df_mun['in_tartu'] == False) & (df_mun['start_year'] == 2014)])}")
print(f"Stops in Tartu (2015): {len(df_mun[(df_mun['in_tartu'] == True) & (df_mun['start_year'] == 2015)])}")
print(f"Stops outside Tartu (2015): {len(df_mun[(df_mun['in_tartu'] == False) & (df_mun['start_year'] == 2015)])}")
Stops:
Stops in Tartu (2014): 247
Stops outside Tartu (2014): 57
Stops in Tartu (2015): 633
Stops outside Tartu (2015): 98
print('Raw data:')
print(f"Activities in Tartu (2014): {len(df2_mun[(df2_mun['in_tartu'] == True) & (df2_mun['year'] == 2014)])}")
print(f"Activities outside Tartu (2014): {len(df2_mun[(df2_mun['in_tartu'] == False) & (df2_mun['year'] == 2014)])}")
print(f"Activities in Tartu (2015): {len(df2_mun[(df2_mun['in_tartu'] == True) & (df2_mun['year'] == 2015)])}")
print(f"Activities outside Tartu (2015): {len(df2_mun[(df2_mun['in_tartu'] == False) & (df2_mun['year'] == 2015)])}")
Raw data:
Activities in Tartu (2014): 66865
Activities outside Tartu (2014): 48644
Activities in Tartu (2015): 53029
Activities outside Tartu (2015): 62673
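To gauge how often the respondent is out of Tartu at a day level, a minimal sketch (an illustrative addition, using the in_tartu flag defined above) counting observed days with at least one stop outside Tartu:
# Share of observed days with at least one stop recorded outside Tartu
stops_by_day = df_mun.assign(date=df_mun['start_time'].dt.date)
days_total = stops_by_day['date'].nunique()
days_outside = stops_by_day.loc[~stops_by_day['in_tartu'], 'date'].nunique()
print(f'{days_outside} of {days_total} observed days include a stop outside Tartu')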
print('Hourly raw data per year outside of Tartu')
counts_2014 = df2_mun[(df2_mun['year'] == 2014) & (df2_mun['in_tartu'] == False)].groupby('hour').count()['timestamp']
counts_2015 = df2_mun[(df2_mun['year'] == 2015) & (df2_mun['in_tartu'] == False)].groupby('hour').count()['timestamp']
hours = list(range(24))
counts_2014 = counts_2014.reindex(hours, fill_value=0)
counts_2015 = counts_2015.reindex(hours, fill_value=0)
width = 0.4
plt.figure(figsize=(12, 6))
plt.bar([h - width/2 for h in hours], counts_2014, width=width, label='2014')
plt.bar([h + width/2 for h in hours], counts_2015, width=width, label='2015')
plt.xlabel('Hour')
plt.ylabel('Count of Raw data')
plt.xticks(hours)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
Hourly raw data per year outside of Tartu
print('Hourly raw data per month outside of Tartu')
counts_july = df2_mun[(df2_mun['month'] == 7) & (df2_mun['in_tartu'] == False)].groupby('hour').count()['timestamp']
counts_august = df2_mun[(df2_mun['month'] == 8) & (df2_mun['in_tartu'] == False)].groupby('hour').count()['timestamp']
hours = list(range(24))
counts_july = counts_july.reindex(hours, fill_value=0)
counts_august = counts_august.reindex(hours, fill_value=0)
width = 0.4
plt.figure(figsize=(12, 6))
plt.bar([h - width/2 for h in hours], counts_july, width=width, label='July')
plt.bar([h + width/2 for h in hours], counts_august, width=width, label='August')
plt.xlabel('Hour')
plt.ylabel('Count of Raw data')
plt.xticks(hours)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
Hourly raw data per month outside of Tartu
print('Hourly stop data per month outside of Tartu')
counts_july = df_mun[(df_mun['start_month'] == 7) & (df_mun['in_tartu'] == False)].groupby('start_hour').count()['start_month']
counts_august = df_mun[(df_mun['start_month'] == 8) & (df_mun['in_tartu'] == False)].groupby('start_hour').count()['start_month']
hours = list(range(24))
counts_july = counts_july.reindex(hours, fill_value=0)
counts_august = counts_august.reindex(hours, fill_value=0)
width = 0.4
plt.figure(figsize=(12, 6))
plt.bar([h - width/2 for h in hours], counts_july, width=width, label='July')
plt.bar([h + width/2 for h in hours], counts_august, width=width, label='August')
plt.xlabel('Hour')
plt.ylabel('Count of stops')
plt.xticks(hours)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
Hourly stop data per month outside of Tartu
Please describe the average daily time use of the respondent (time at home, working place, summerhouse, in movement)¶
import warnings
warnings.filterwarnings('ignore')
print('Average Time Use per Hour by Location')
cluster_map = {
CREPP: 'Crepp',
WERNER: 'Werner',
HOME: 'Home',
WORK: 'Work',
WORK2: 'Work2'
}
df_mun['location_label'] = df_mun['location_cluster'].map(cluster_map)
hours = list(range(24))
locations = df_mun['location_label'].dropna().unique()
time_matrix = pd.DataFrame(0, index=hours, columns=locations)
for _, row in df_mun.iterrows():
label = row['location_label']
if pd.isna(label):
continue
start = int(row['start_hour'])
end = int(row['end_hour'])
duration = row['duration_millis'] / (1000 * 60 * 60) # to hours
    if end < start:   # the stop spans midnight
        end += 24
    if end == start:  # the stop starts and ends within the same clock hour
        end += 1
    total_hours = end - start
hourly_share = duration / total_hours if total_hours > 0 else 0
for h in range(start, end):
time_matrix.at[h % 24, label] += hourly_share
plt.figure(figsize=(15, 8))
bottom = np.zeros(len(time_matrix))
for label in time_matrix.columns:
plt.bar(time_matrix.index, time_matrix[label], bottom=bottom, label=label)
bottom += time_matrix[label].values
plt.xlabel("Hour of Day")
plt.ylabel("Total Time Spent (Hours)")
plt.xticks(range(24))
plt.legend()
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
Average Time Use per Hour by Location
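As a rough complement to the hourly chart above, a minimal sketch (illustrative, not part of the original analysis) of average hours per observed day spent at each labelled location, with time in movement approximated by the gaps between consecutive stops; the summer house could be included by adding SUMMER_HOUSE to cluster_map:
# Average hours per observed day at each labelled location
obs_days = df_mun['start_time'].dt.date.nunique()
hours_per_place = (
    df_mun.dropna(subset=['location_label'])
    .groupby('location_label')['duration_millis']
    .sum() / (1000 * 60 * 60) / obs_days
)
# Time in movement, approximated by the travel_time gaps computed earlier
# (this also absorbs untracked periods, so it overestimates movement)
moving_hours = df_mun_sorted['travel_time'].dt.total_seconds().sum() / 3600 / obs_days
print(hours_per_place.round(2))
print('In movement (approx.):', round(moving_hours, 2), 'hours per day')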
df_mun.dtypes
start_time           datetime64[ns]
end_time             datetime64[ns]
duration_millis             float64
lng                         float64
lat                         float64
radius                      float64
municipality                 object
point                      geometry
start_year                    int32
end_year                      int32
start_month                   int32
end_month                     int32
start_day                     int32
start_day_of_week             int32
start_week                   UInt32
end_day                       int32
start_hour                    int32
end_hour                      int32
location_cluster              int64
in_tartu                       bool
location_label               object
dtype: object
Can you detect any other interesting patterns in the data?¶