Note: The borders of Tartu changed with the municipality reform, so some areas that were previously not part of Tartu (e.g. 'Haage küla', 'Ilmatsalu küla') belong to Tartu after 2017.

In [2]:
import contextily as cx
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from adjustText import adjust_text
from sklearn.cluster import DBSCAN
import seaborn as sns
In [3]:
# Load the detected stops (df) and the raw GPS fixes (df2); both files are ';'-separated.
df = pd.read_csv('data/stops_gps.csv', sep=';')
df2 = pd.read_csv('data/raw_gps.csv', sep=';')
In [4]:
# Size of the stops table as (rows, columns).
df.shape
Out[4]:
(1035, 6)
In [5]:
# Convert timestamps from str to Timestamp class
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])

df2['timestamp'] = pd.to_datetime(df2['time_system_ts'])


def _decimal_comma_to_float(values):
    """Convert a Series of strings written with a decimal comma (e.g. '26,72') to floats."""
    return values.str.replace(',', '.', regex=False).astype(float)


# Convert latitude, longitude and radius from str to float
for column in ('lng', 'lat', 'radius'):
    df[column] = _decimal_comma_to_float(df[column])

# Raw GPS fixes store longitude in 'x' and latitude in 'y'. Converted with the
# same vectorised string operation as the stops table instead of a per-row
# Python callback.
df2['lng'] = _decimal_comma_to_float(df2['x'])
df2['lat'] = _decimal_comma_to_float(df2['y'])

Check for errors:¶

In [6]:
# A stop has to begin strictly before it ends.
# Show the rows violating that rule (the displayed frame is empty when there are none).
starts_not_before_end = df['start_time'].ge(df['end_time'])
df.loc[starts_not_before_end]
Out[6]:
start_time end_time duration_millis lng lat radius
In [7]:
# duration_millis should equal (end_time - start_time) expressed in milliseconds
elapsed_millis = (df['end_time'] - df['start_time']).dt.total_seconds() * 1000
duration_mismatch = df['duration_millis'] != elapsed_millis

print('Rows with wrong duration:', int(duration_mismatch.sum()))
Rows with wrong duration: 780
In [8]:
# Fix duration
# Overwrite the stored duration with the value derived from the two timestamps
# (in milliseconds), so that duration_millis is always consistent with them.
df['duration_millis'] = (df['end_time'] - df['start_time']).dt.total_seconds() * 1000
In [9]:
# Number of stop rows that have a missing value in at least one column
df.isna().any(axis='columns').sum()
Out[9]:
0
In [10]:
# Stops whose radius exceeds 70 (9 rows in this data set)
df.loc[df['radius'].gt(70)]
Out[10]:
start_time end_time duration_millis lng lat radius
15 2014-07-01 11:34:35 2014-07-01 11:41:44 429000.0 25.965064 58.363173 73.522475
52 2014-07-31 19:42:00 2014-08-01 13:48:11 65171000.0 26.716240 58.375423 92.589390
72 2014-08-31 01:50:24 2014-08-31 02:24:12 2028000.0 26.717806 58.376259 72.659606
150 2014-07-11 23:49:22 2014-07-12 01:39:03 6581000.0 26.721638 58.381380 77.130602
414 2015-07-26 13:47:30 2015-07-26 14:03:37 967000.0 2.299585 48.857739 74.427139
597 2015-08-03 13:38:26 2015-08-03 13:45:36 430000.0 26.722594 58.374064 71.034769
658 2015-08-18 10:39:21 2015-08-18 11:01:16 1315000.0 26.720888 58.380411 71.856091
694 2015-07-24 18:36:04 2015-07-24 18:44:13 489000.0 2.347044 48.853571 73.398368
881 2015-08-01 17:39:26 2015-08-01 18:26:20 2814000.0 26.608733 58.378765 78.392248

Add municipalities¶

In [11]:
# Build point geometries (x = longitude, y = latitude) with geopandas'
# vectorised constructor instead of calling Point() once per row.
df['point'] = gpd.points_from_xy(df['lng'], df['lat'])
df2['point'] = gpd.points_from_xy(df2['lng'], df2['lat'])

gdf_points = gpd.GeoDataFrame(df, geometry='point', crs='EPSG:4326')
gdf2_points = gpd.GeoDataFrame(df2, geometry='point', crs='EPSG:4326')

# Municipality polygons, reprojected to the same CRS as the points.
municipalities = gpd.read_file('data/municipalities.geojson')
municipalities = municipalities.to_crs("EPSG:4326")

# Columns kept after the spatial join; 'ONIMI' is renamed to 'municipality'.
stop_columns = [
    'start_time', 'end_time', 'duration_millis',
    'lng', 'lat', 'radius',
    'ONIMI', 'point',
]
raw_columns = [
    'timestamp', 'accuracy', 'speed', 'altitude',
    'lng', 'lat',
    'ONIMI', 'point',
]

# Left join: every point is kept; points that fall in no municipality polygon
# get a missing 'ONIMI'.
gdf_with_municipalities = gpd.sjoin(gdf_points, municipalities, how="left", predicate="within")
df_mun = gdf_with_municipalities[stop_columns].rename(columns={'ONIMI': 'municipality'})

gdf2_with_municipalities = gpd.sjoin(gdf2_points, municipalities, how="left", predicate="within")
df2_mun = gdf2_with_municipalities[raw_columns].rename(columns={'ONIMI': 'municipality'})
In [12]:
# Stops for which the spatial join found no municipality
missing_municipality = df_mun['municipality'].isna()
print('Number of rows where municipality was not found: ', int(missing_municipality.sum()))
Number of rows where municipality was not found:  74
In [13]:
print('Stops outside of Estonia:')

world = gpd.read_file('data/world.geojson')

# NOTE(review): every stop in df_mun is drawn here, not only the ones without
# a municipality; confirm whether the plot should be restricted to those.
fig, ax = plt.subplots(figsize=(15, 10))
world.plot(ax=ax, color='lightgrey', edgecolor='black')
df_mun.plot(ax=ax, color='red', markersize=5)

# Zoom to the bounding box of the stops, padded by one degree on each side.
xmin, ymin, xmax, ymax = df_mun.total_bounds
ax.set_xlim(left=xmin - 1, right=xmax + 1)
ax.set_ylim(bottom=ymin - 1, top=ymax + 1)

plt.show()
Stops outside of Estonia:
No description has been provided for this image

Prepare data analysis¶

In [14]:
# Calendar parts of each stop's start/end time, listed in the order in which
# the columns are appended to df_mun.
stop_time_parts = [
    ('start_year', 'start_time', lambda t: t.dt.year),
    ('end_year', 'end_time', lambda t: t.dt.year),
    ('start_month', 'start_time', lambda t: t.dt.month),
    ('end_month', 'end_time', lambda t: t.dt.month),
    ('start_day', 'start_time', lambda t: t.dt.day),
    ('start_day_of_week', 'start_time', lambda t: t.dt.dayofweek),
    ('start_week', 'start_time', lambda t: t.dt.isocalendar().week),
    ('end_day', 'end_time', lambda t: t.dt.day),
    ('start_hour', 'start_time', lambda t: t.dt.hour),
    ('end_hour', 'end_time', lambda t: t.dt.hour),
]
for new_column, source_column, extract in stop_time_parts:
    df_mun[new_column] = extract(df_mun[source_column])

# Using metric system for distance calculations
# NOTE(review): df_mun_metric is not referenced again in the visible notebook;
# the later distance calculation reprojects to EPSG:3301 instead.
df_mun_metric = df_mun.to_crs(epsg=3857)

# Calendar parts of each raw GPS fix.
for part in ('year', 'month', 'day', 'hour', 'time'):
    df2_mun[part] = getattr(df2_mun['timestamp'].dt, part)
In [15]:
# Get Tartu municipality
# Rows of the municipality layer whose name ('ONIMI') is exactly 'Tartu linn'.
tartu = municipalities[municipalities['ONIMI'] == 'Tartu linn']

Detect meaningful places: home, work and other meaningful locations from the data! Describe the methodology and output, visualise it (e.g. map)!¶

Using the stops_gps.csv data. Using DBSCAN (algorithm='auto') we create clusters that group the most visited locations.
Using the haversine metric we find the neighbouring points within a 25 m radius (approximately the radius the GPS uses).
Then, to establish each location, we take the centre (mean) point of the cluster's points.

We got 8 clusters:

  1. The most frequent stop location is at the Pallas art school (Pallase kunstikool), above which there are also apartments; this is most likely the person's home.
  2. Pepleri street 3, work 1.
  3. Kraavihall OÜ, Vaarika tn 13-1, second most likely work place
  4. Ülikooli tn 17-310 might be an institute of UT or the Werner café; since there are so few visits, we assume it is the café
  5. Crepp cafe
  6. A guest house in Elva
  7. Summer cottage
  8. Visiting Paris


5 -> Pallase kunstikool (home); 465; 58.37451318620583,26.722370884148493
2 -> Rektangel OÜ, Pepleri tn 3; 86; 58.37574530745641,26.71774397444003
0 -> Kraavihall OÜ, Vaarika tn 13-1; 32; 58.352871786421495,26.693132724172344
7 -> Ülikooli tn 17-310 (work ??); 21; 58.38131036234592,26.720802410790206
4 -> Crepp / Trepp; 15; 58.38197287767127,26.721411302633566



3 -> Külalistemaja Vehendi Motell, Elva; 14; 58.22425014827104,26.127454574200243
1 -> Järvekalda tee 1/1, Viljandimaa (puhkekeskus); 13; 58.36297057665973,25.96458674992763

6 -> Paris; 16; 48.85431059779363,2.307358561105023
In [16]:
# Cluster only rows that have both coordinates, and remember which rows those
# are so that the labels are written back to exactly the rows they belong to
# (a positional assignment would fail as soon as one coordinate is missing).
has_coords = df_mun[['lng', 'lat']].notna().all(axis=1)
coords = df_mun.loc[has_coords, ['lng', 'lat']].to_numpy()

# Convert degrees to radians for haversine
coords_rad = np.radians(coords)
# NOTE(review): scikit-learn's haversine metric documents its input as
# [latitude, longitude], but the columns here are ordered [lng, lat], so the
# 25 m neighbourhood is not isotropic. Swapping the order will change cluster
# membership and numbering, so the hard-coded cluster ids used for labelling
# must be re-derived in the same change; the order is therefore left as-is here.

# DBSCAN clustering with Haversine metric
kms_per_radian = 6371.0088
epsilon = 0.025 / kms_per_radian  # Neighboring radius 25 meters, expressed in radians

db = DBSCAN(
    eps=epsilon,
    min_samples=10,  # Cluster must have at least n nearby points
    algorithm='auto',
    metric='haversine'
).fit(coords_rad)

# Assign cluster labels to the dataframe; -1 is DBSCAN's noise label and is
# also used for rows that could not be clustered.
df_mun['location_cluster'] = -1
df_mun.loc[has_coords, 'location_cluster'] = db.labels_
In [17]:
# DBSCAN cluster ids of the places identified in the text above.
HOME = 5
WORK = 2
WORK2 = 0
WERNER = 7
CREPP = 4
GUEST_HOUSE = 3
SUMMER_HOUSE = 1
PARIS = 6

# (cluster id, map label) pairs in plotting order. PARIS is not listed, so it
# gets no marker on the maps.
cluster_labels = [
    (HOME, 'Home'),
    (WORK, 'Work'),
    (WORK2, 'Work'),
    (WERNER, 'Werner'),
    (CREPP, 'Crepp'),
    (GUEST_HOUSE, 'Guest house'),
    (SUMMER_HOUSE, 'Summer house'),
]


def _cluster_center(cluster_id):
    """Return the mean longitude/latitude of a cluster's stops as a Point."""
    members = df_mun[df_mun['location_cluster'] == cluster_id]
    return Point(members['lng'].mean(), members['lat'].mean())


# Collect cluster center points and labels
centers = [_cluster_center(cluster_id) for cluster_id, _ in cluster_labels]
labels = [name for _, name in cluster_labels]

# Create GeoDataFrame; 'coords' holds "lat,lng" text for each centre
locations = gpd.GeoDataFrame({'label': labels}, geometry=centers, crs='EPSG:4326')
locations['coords'] = [f"{center.y},{center.x}" for center in locations['geometry']]
In [18]:
# adjust_text and plt are already imported in the notebook's first cell, so
# they are not re-imported here.

# Only the places inside Tartu are drawn: the last two rows of `locations`
# (the guest house and the summer house) are sliced off once, here.
tartu_locations = locations[:-2]

fig, ax = plt.subplots(figsize=(15, 10))
tartu.plot(ax=ax, color='lightgray', edgecolor='black')
tartu_locations.plot(ax=ax, color='red', markersize=10)

# One text label per marker, initially anchored at the marker position.
texts = [
    ax.text(
        point.x, point.y, label,
        fontsize=14, ha='left', va='bottom', color='red',
        bbox=dict(facecolor='white', alpha=1, edgecolor='none', pad=1.5)
    )
    for point, label in zip(tartu_locations.geometry, tartu_locations['label'])
]
# Move the labels apart so they do not overlap, connecting each to its marker with an arrow.
adjust_text(
    texts, ax=ax,
    expand_text=(1.3, 1.8),
    expand_points=(1.2, 1.9),
    force_text=1,
    force_points=1,
    arrowprops=dict(arrowstyle='->', color='black', lw=1)
)

plt.show()
Looks like you are using a tranform that doesn't support FancyArrowPatch, using ax.annotate instead. The arrows might strike through texts. Increasing shrinkA in arrowprops might help.
No description has been provided for this image
In [19]:
# adjust_text and plt are already imported in the notebook's first cell, so
# they are not re-imported here.

# All labelled places on top of the municipality layer.
fig, ax = plt.subplots(figsize=(15, 10))
municipalities.plot(ax=ax, color='lightgray', edgecolor='black')
locations.plot(ax=ax, color='red', markersize=10)

# One text label per marker, initially anchored at the marker position.
texts = [
    ax.text(
        point.x, point.y, label,
        fontsize=14, ha='left', va='bottom', color='red',
        bbox=dict(facecolor='white', alpha=1, edgecolor='none', pad=1.5)
    )
    for point, label in zip(locations.geometry, locations['label'])
]
# Move the labels apart so they do not overlap, connecting each to its marker with an arrow.
adjust_text(
    texts, ax=ax,
    expand_text=(1.3, 1.8),
    expand_points=(1.2, 1.9),
    force_text=1,
    force_points=1,
    arrowprops=dict(arrowstyle='->', color='black', lw=1)
)

plt.show()
No description has been provided for this image

Describe the temporal patterns of visiting the city centre of Tartu!¶

In [20]:
# City-centre boundary; presumably exported from this OpenStreetMap relation:
# https://www.openstreetmap.org/relation/4572218#map=15/58.37774/26.72394
tartu_center = gpd.read_file('data/tartu_center.geojson')

print('Tartu center')
fig, ax = plt.subplots(figsize=(15, 10))
tartu_center.plot(ax=ax, color='lightgrey', edgecolor='black')
plt.show()
Tartu center
No description has been provided for this image
In [21]:
print('Stops in center per week/hours')

# Restrict the stops to those located inside the city-centre boundary; without
# this filter the heatmap would count every stop in the data set.
center_area = tartu_center[[tartu_center.geometry.name]].to_crs(df_mun.crs)
stops_within_center = gpd.sjoin(
    df_mun[[df_mun.geometry.name]], center_area, how='inner', predicate='within'
)
# Select by index so a stop matched by several centre features is counted once.
center_stops = df_mun[df_mun.index.isin(stops_within_center.index)]

# Rows: weekday 0..6, columns: hour 0..23. Reindexing guarantees the full grid
# even when some weekday or hour has no stop, so the tick labels stay aligned.
heatmap_data = (
    center_stops.groupby(['start_day_of_week', 'start_hour'])
    .size()
    .unstack(fill_value=0)
    .reindex(index=range(7), columns=range(24), fill_value=0)
)

plt.figure(figsize=(14, 6))
sns.heatmap(heatmap_data, cmap='YlOrRd', linewidths=0.5, annot=True, fmt='d')

plt.xlabel('Hour of Day')
plt.ylabel('Day of Week')
plt.yticks(ticks=[0.5 + i for i in range(7)], labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=0)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
Stops in center per week/hours
No description has been provided for this image

How to describe the mobility (e.g. daily/weekly/monthly/seasonal mileage; transport mode [walking, cycling, public transportation, private car, etc])?¶

In [22]:
# Chronologically ordered copy of the stops, so that the previous row is the
# previous stop in time.
df_mun_sorted = df_mun.sort_values(by='start_time').copy()
In [23]:
# Travel time = this stop's start minus the previous stop's end
# (missing for the first row, which has no predecessor).
df_mun_sorted['travel_time'] = df_mun_sorted['start_time'] - df_mun_sorted['end_time'].shift()
In [24]:
# Reproject the sorted stops to EPSG:3301 for the distance calculation
# (presumably a metre-based CRS, given the '_m' column derived from it — confirm).
df_mun_sorted_3301 = df_mun_sorted.to_crs(epsg=3301)
In [25]:
# Geometry of the previous stop in time (missing for the first row).
df_mun_sorted_3301['prev_point'] = df_mun_sorted_3301['point'].shift()
# Straight-line distance between each stop and the previous one, in the units
# of the projected CRS (treated as metres by the column name).
df_mun_sorted_3301['travel_distance_m'] = df_mun_sorted_3301.distance(df_mun_sorted_3301['prev_point'])
df_mun_sorted_3301['travel_distance_km'] = df_mun_sorted_3301['travel_distance_m'] / 1000
In [26]:
# Hours elapsed between leaving the previous stop and arriving at this one.
travel_hours = df_mun_sorted['travel_time'].dt.total_seconds() / 3600
# A zero or negative gap (e.g. overlapping stops) would produce an infinite or
# negative speed, which the transport classification would then read as
# 'plane' or 'walking'. Such gaps are treated as missing instead.
travel_hours = travel_hours.where(travel_hours > 0)

# Average straight-line speed between consecutive stops in km/h.
df_mun_sorted_3301['travel_km_h'] = df_mun_sorted_3301['travel_distance_km'] / travel_hours
In [27]:
speed = df_mun_sorted_3301['travel_km_h']

# Speed band (km/h) for each transport form; upper bounds are inclusive.
speed_bands = {
    'walking': speed <= 5,
    'cycling': (speed > 5) & (speed <= 15),
    'public_transport': (speed > 15) & (speed <= 40),
    'private_car': (speed > 40) & (speed <= 200),
    'plane': speed > 200,
}

conditions = list(speed_bands.values())
choices = list(speed_bands.keys())

# Rows matching no band (e.g. missing speed) are labelled 'unknown'.
df_mun_sorted_3301['transport_form'] = np.select(conditions, choices, default='unknown')
In [28]:
print('Transport Form Counts by Day')

day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Number of trips per weekday and transport form, ignoring unclassified trips.
known_trips = df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
grouped = (
    known_trips
    .groupby(['start_day_of_week', 'transport_form'])
    .size()
    .unstack(fill_value=0)
    # One row per weekday (0 = Monday ... 6 = Sunday) even when a day has no
    # trips, so the seven fixed labels below always match the bars.
    .reindex(range(7), fill_value=0)
)

ax = grouped.plot(kind='bar', figsize=(10, 6))

ax.set_xticklabels(day_names)

# Print the count above every bar.
for container in ax.containers:
    ax.bar_label(container, label_type='edge', fontsize=8)

ax.set_xlabel('Day of week')
ax.set_ylabel('Count')
ax.legend(title='Transport Form')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Transport Form Counts by Day
No description has been provided for this image
In [29]:
print('Transport Form Counts by Week')

# Number of trips per ISO week number and transport form, ignoring unclassified
# trips. Weeks are grouped by number only, so the same week of different years
# is pooled.
known_trips = df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
grouped = (
    known_trips
    .groupby(['start_week', 'transport_form'])
    .size()
    .unstack(fill_value=0)
)

ax = grouped.plot(kind='bar', figsize=(10, 6))

# Label the bars with the weeks actually present instead of a fixed list, so
# the labels cannot drift from the data.
ax.set_xticklabels([str(week) for week in grouped.index])

# Print the count above every bar.
for container in ax.containers:
    ax.bar_label(container, label_type='edge', fontsize=8)

ax.set_xlabel('Week')
ax.set_ylabel('Count')
ax.legend(title='Transport Form')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Transport Form Counts by Week
No description has been provided for this image
In [30]:
print('Transport Form Counts by Month')

# Number of trips per calendar month and transport form, ignoring unclassified
# trips. Months are grouped by number only, so both years are pooled.
known_trips = df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
grouped = (
    known_trips
    .groupby(['start_month', 'transport_form'])
    .size()
    .unstack(fill_value=0)
)

ax = grouped.plot(kind='bar', figsize=(10, 6))

# Derive the month names from the months actually present instead of a fixed
# list, so the labels cannot drift from the data.
month_names = [pd.Timestamp(year=2000, month=int(month), day=1).month_name() for month in grouped.index]
ax.set_xticklabels(month_names)

# Print the count above every bar.
for container in ax.containers:
    ax.bar_label(container, label_type='edge', fontsize=8)

ax.set_xlabel('Month')
ax.set_ylabel('Count')
ax.legend(title='Transport Form')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Transport Form Counts by Month
No description has been provided for this image
In [31]:
print('Transport Form Counts by Year')

# Number of trips per year and transport form, ignoring unclassified trips.
known_trips = df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
grouped = (
    known_trips
    .groupby(['start_year', 'transport_form'])
    .size()
    .unstack(fill_value=0)
)

ax = grouped.plot(kind='bar', figsize=(10, 6))

# Label the bars with the years actually present instead of a fixed list.
ax.set_xticklabels([str(year) for year in grouped.index])

# Print the count above every bar.
for container in ax.containers:
    ax.bar_label(container, label_type='edge', fontsize=8)

ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.legend(title='Transport Form')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Transport Form Counts by Year
No description has been provided for this image
In [32]:
print('Mileage by Day')

# Total straight-line distance per weekday. Reindexing to 0..6 (Monday..Sunday)
# keeps one bar per weekday even if a day has no trips, so the seven fixed
# labels below always match the bars.
daily_km = (
    df_mun_sorted_3301
    .groupby('start_day_of_week')['travel_distance_km']
    .sum()
    .reindex(range(7), fill_value=0)
)

ax = daily_km.plot(kind='bar', figsize=(10, 6))

ax.set_xlabel('Day of the Week')
ax.set_ylabel('Mileage (km)')
ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=0)

plt.tight_layout()
plt.show()
Mileage by Day
No description has been provided for this image
In [33]:
print('Mileage by Day')

# Distance per weekday split by transport form, ignoring unclassified trips.
# Reindexing to 0..6 (Monday..Sunday) keeps one bar per weekday even if a day
# has no trips, so the seven fixed labels below always match the bars.
known_trips = df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
daily_km_by_form = (
    known_trips
    .groupby(['start_day_of_week', 'transport_form'])['travel_distance_km']
    .sum()
    .unstack(fill_value=0)
    .reindex(range(7), fill_value=0)
)

ax = daily_km_by_form.plot(kind='bar', stacked=True, figsize=(12, 6))

ax.set_xlabel('Day of the Week')
ax.set_ylabel('Mileage (km)')
ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=0)

plt.tight_layout()
plt.show()
Mileage by Day
No description has been provided for this image
In [34]:
print('Mileage by Day (in %)')

# Kilometres per weekday and transport form, ignoring unclassified trips.
# (The name is kept for compatibility; the values are distances, not counts.)
daily_counts = (
    df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
    .groupby(['start_day_of_week', 'transport_form'])['travel_distance_km']
    .sum()
    .unstack(fill_value=0)
    # One row per weekday (0 = Monday ... 6 = Sunday) so the fixed labels below
    # always match the bars, even when a day has no trips.
    .reindex(range(7), fill_value=0)
)

# Each transport form's share of that weekday's total distance.
daily_percentages = daily_counts.div(daily_counts.sum(axis=1), axis=0) * 100

ax = daily_percentages.plot(kind='bar', stacked=True, figsize=(12, 6))

ax.set_xlabel('Day of the Week')
# The bars show shares of distance, not of trip counts.
ax.set_ylabel('Share of Mileage (%)')
ax.set_xticks(range(7))
ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=0)

plt.tight_layout()
plt.show()
Mileage by Day (in %)
No description has been provided for this image
In [35]:
print('Mileage by Week')

# Total straight-line distance per ISO week number (same week of different
# years is pooled).
weekly_km = df_mun_sorted_3301.groupby('start_week')['travel_distance_km'].sum()

ax = weekly_km.plot(kind='bar', figsize=(10, 6))

ax.set_xlabel('Week')
ax.set_ylabel('Mileage (km)')
# Label the bars with the weeks actually present instead of a fixed list.
ax.set_xticklabels([str(week) for week in weekly_km.index], rotation=0)

plt.tight_layout()
plt.show()
Mileage by Week
No description has been provided for this image
In [36]:
print('Mileage by Week')

# Distance per ISO week number split by transport form, ignoring unclassified trips.
known_trips = df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
weekly_km_by_form = (
    known_trips
    .groupby(['start_week', 'transport_form'])['travel_distance_km']
    .sum()
    .unstack(fill_value=0)
)

ax = weekly_km_by_form.plot(kind='bar', stacked=True, figsize=(12, 6))

ax.set_xlabel('Week')
ax.set_ylabel('Mileage (km)')
# Label the bars with the weeks actually present instead of a fixed list.
ax.set_xticklabels([str(week) for week in weekly_km_by_form.index], rotation=0)

plt.tight_layout()
plt.show()
Mileage by Week
No description has been provided for this image
In [37]:
print('Mileage by Week (in %)')

# Kilometres per ISO week number and transport form, ignoring unclassified trips.
# (The name is kept for compatibility; the values are distances, not counts.)
weekly_counts = (
    df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
    .groupby(['start_week', 'transport_form'])['travel_distance_km']
    .sum()
    .unstack(fill_value=0)
)

# Each transport form's share of that week's total distance.
weekly_percentages = weekly_counts.div(weekly_counts.sum(axis=1), axis=0) * 100

ax = weekly_percentages.plot(kind='bar', stacked=True, figsize=(12, 6))

ax.set_xlabel('Week')
# The bars show shares of distance, not of trip counts.
ax.set_ylabel('Share of Mileage (%)')
ax.set_xticks(range(len(weekly_percentages)))
ax.set_xticklabels(weekly_percentages.index, rotation=0)

plt.legend(title='Transport Form', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
Mileage by Week (in %)
No description has been provided for this image
In [38]:
print('Mileage by Month')

# Total straight-line distance per calendar month (both years pooled).
monthly_km = df_mun_sorted_3301.groupby('start_month')['travel_distance_km'].sum()

ax = monthly_km.plot(kind='bar', figsize=(10, 6))

ax.set_xlabel('Month')
ax.set_ylabel('Mileage (km)')
# Derive the month names from the months actually present instead of a fixed list.
month_names = [pd.Timestamp(year=2000, month=int(month), day=1).month_name() for month in monthly_km.index]
ax.set_xticklabels(month_names, rotation=0)

plt.tight_layout()
plt.show()
Mileage by Month
No description has been provided for this image
In [39]:
print('Mileage by Month')

# Distance per calendar month split by transport form, ignoring unclassified trips.
known_trips = df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
monthly_km_by_form = (
    known_trips
    .groupby(['start_month', 'transport_form'])['travel_distance_km']
    .sum()
    .unstack(fill_value=0)
)

ax = monthly_km_by_form.plot(kind='bar', stacked=True, figsize=(12, 6))

ax.set_xlabel('Month')
ax.set_ylabel('Mileage (km)')
# Derive the month names from the months actually present instead of a fixed list.
month_names = [
    pd.Timestamp(year=2000, month=int(month), day=1).month_name()
    for month in monthly_km_by_form.index
]
ax.set_xticklabels(month_names, rotation=0)

plt.tight_layout()
plt.show()
Mileage by Month
No description has been provided for this image
In [57]:
print('Mileage by Month (in %)')

# Kilometres per calendar month and transport form, ignoring unclassified trips.
# (The name is kept for compatibility; the values are distances, not counts.)
monthly_counts = (
    df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
    .groupby(['start_month', 'transport_form'])['travel_distance_km']
    .sum()
    .unstack(fill_value=0)
)

# Each transport form's share of that month's total distance.
monthly_percentages = monthly_counts.div(monthly_counts.sum(axis=1), axis=0) * 100

ax = monthly_percentages.plot(kind='bar', stacked=True, figsize=(12, 6))

ax.set_xlabel('Month')
# The bars show shares of distance, not of trip counts.
ax.set_ylabel('Share of Mileage (%)')
ax.set_xticks(range(len(monthly_percentages)))
ax.set_xticklabels(monthly_percentages.index, rotation=0)

plt.legend(title='Transport Form', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
Mileage by Monthly (in %)
No description has been provided for this image
In [41]:
print('Mileage by Year')

# Total straight-line distance per year.
yearly_km = df_mun_sorted_3301.groupby('start_year')['travel_distance_km'].sum()

ax = yearly_km.plot(kind='bar', figsize=(10, 6))

ax.set_xlabel('Year')
ax.set_ylabel('Mileage (km)')
# Label the bars with the years actually present instead of a fixed list.
ax.set_xticklabels([str(year) for year in yearly_km.index], rotation=0)

plt.tight_layout()
plt.show()
Mileage by Year
No description has been provided for this image
In [42]:
print('Mileage by Year')

# Distance per year split by transport form, ignoring unclassified trips.
known_trips = df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
yearly_km_by_form = (
    known_trips
    .groupby(['start_year', 'transport_form'])['travel_distance_km']
    .sum()
    .unstack(fill_value=0)
)

ax = yearly_km_by_form.plot(kind='bar', stacked=True, figsize=(12, 6))

ax.set_xlabel('Year')
ax.set_ylabel('Mileage (km)')
# Label the bars with the years actually present instead of a fixed list.
ax.set_xticklabels([str(year) for year in yearly_km_by_form.index], rotation=0)

plt.tight_layout()
plt.show()
Mileage by Year
No description has been provided for this image
In [56]:
print('Mileage by Year (in %)')

# Kilometres per year and transport form, ignoring unclassified trips.
# (The name is kept for compatibility; the values are distances, not counts.)
yearly_counts = (
    df_mun_sorted_3301[df_mun_sorted_3301['transport_form'] != 'unknown']
    .groupby(['start_year', 'transport_form'])['travel_distance_km']
    .sum()
    .unstack(fill_value=0)
)

# Each transport form's share of that year's total distance.
yearly_percentages = yearly_counts.div(yearly_counts.sum(axis=1), axis=0) * 100

ax = yearly_percentages.plot(kind='bar', stacked=True, figsize=(12, 6))

ax.set_xlabel('Year')
# The bars show shares of distance, not of trip counts.
ax.set_ylabel('Share of Mileage (%)')
ax.set_xticks(range(len(yearly_percentages)))
ax.set_xticklabels(yearly_percentages.index, rotation=0)

plt.legend(title='Transport Form', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
Mileage by Year (in %)
No description has been provided for this image

How has mobility changed between the year 2014 to 2015?¶

How often is the respondent out of Tartu? Can you detect regularities?¶

In [43]:
# Flag rows located in the 'Tartu linn' municipality. Rows with a missing
# municipality compare unequal and are therefore flagged False.
df_mun['in_tartu'] = df_mun['municipality'] == 'Tartu linn'
df2_mun['in_tartu'] = df2_mun['municipality'] == 'Tartu linn'
In [44]:
print('Points in Estonia and inside of Tartu')

# Stops flagged as lying in the Tartu municipality.
stops_in_tartu = df_mun[df_mun['in_tartu']]

fig, ax = plt.subplots(figsize=(15, 10))
municipalities.plot(ax=ax, color='lightgrey', edgecolor='black')
tartu.plot(ax=ax, color='green')
stops_in_tartu.plot(ax=ax, color='red', markersize=5)

plt.show()
Points in Estonia and inside of Tartu
No description has been provided for this image
In [45]:
print('Points in Estonia and outside of Tartu')

# Stops that matched some municipality other than Tartu; stops without any
# municipality match are left out.
outside_tartu = ~df_mun['in_tartu']
has_municipality = df_mun['municipality'].notna()
stops_elsewhere = df_mun[outside_tartu & has_municipality]

fig, ax = plt.subplots(figsize=(15, 10))
municipalities.plot(ax=ax, color='lightgrey', edgecolor='black')
tartu.plot(ax=ax, color='green')
stops_elsewhere.plot(ax=ax, color='red', markersize=5)

plt.show()
Points in Estonia and outside of Tartu
No description has been provided for this image
In [46]:
# Count the stops whose start and end fall in different years. The row count is
# taken from the boolean mask; DataFrame.size would report rows x columns.
crosses_year = df_mun['start_year'] != df_mun['end_year']
print(
    'Stops that have different start and end year:',
    int(crosses_year.sum())
)
Stops that have different start and end year: 0
In [47]:
print('Stops:')

# For each year: number of stops inside and outside the Tartu municipality.
for year in (2014, 2015):
    started_in_year = df_mun['start_year'] == year
    inside_count = len(df_mun[df_mun['in_tartu'] & started_in_year])
    outside_count = len(df_mun[~df_mun['in_tartu'] & started_in_year])
    print(f"Stops in Tartu ({year}): {inside_count}")
    print(f"Stops outside Tartu ({year}): {outside_count}")
Stops:
Stops in Tartu (2014): 247
Stops outside Tartu (2014): 57
Stops in Tartu (2015): 633
Stops outside Tartu (2015): 98
In [48]:
print('Raw data:')

# For each year: number of raw GPS fixes inside and outside the Tartu municipality.
for year in (2014, 2015):
    recorded_in_year = df2_mun['year'] == year
    inside_count = len(df2_mun[df2_mun['in_tartu'] & recorded_in_year])
    outside_count = len(df2_mun[~df2_mun['in_tartu'] & recorded_in_year])
    print(f"Activities in Tartu ({year}): {inside_count}")
    print(f"Activities outside Tartu ({year}): {outside_count}")
Raw data:
Activities in Tartu (2014): 66865
Activities outside Tartu (2014): 48644
Activities in Tartu (2015): 53029
Activities outside Tartu (2015): 62673
In [49]:
print('Hourly raw data per year outside of Tartu')

hours = list(range(24))

# Raw GPS fixes recorded outside the Tartu municipality.
raw_outside_tartu = df2_mun[~df2_mun['in_tartu']]

# Fixes per hour of day for each year; hours without any fix are filled with 0.
counts_2014 = (
    raw_outside_tartu[raw_outside_tartu['year'] == 2014]
    .groupby('hour')['timestamp'].count()
    .reindex(hours, fill_value=0)
)
counts_2015 = (
    raw_outside_tartu[raw_outside_tartu['year'] == 2015]
    .groupby('hour')['timestamp'].count()
    .reindex(hours, fill_value=0)
)

# Two bars per hour, side by side: left = 2014, right = 2015.
width = 0.4
left_positions = [h - width/2 for h in hours]
right_positions = [h + width/2 for h in hours]

plt.figure(figsize=(12, 6))
plt.bar(left_positions, counts_2014, width=width, label='2014')
plt.bar(right_positions, counts_2015, width=width, label='2015')

plt.xlabel('Hour')
plt.ylabel('Count of Raw data')
plt.xticks(hours)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
Hourly raw data per year outside of Tartu
No description has been provided for this image
In [50]:
print('Hourly raw data per month outside of Tartu')

hours = list(range(24))

# Raw GPS fixes recorded outside the Tartu municipality (both years pooled).
raw_outside_tartu = df2_mun[~df2_mun['in_tartu']]

# Fixes per hour of day for each month; hours without any fix are filled with 0.
counts_july = (
    raw_outside_tartu[raw_outside_tartu['month'] == 7]
    .groupby('hour')['timestamp'].count()
    .reindex(hours, fill_value=0)
)
counts_august = (
    raw_outside_tartu[raw_outside_tartu['month'] == 8]
    .groupby('hour')['timestamp'].count()
    .reindex(hours, fill_value=0)
)

# Two bars per hour, side by side: left = July, right = August.
width = 0.4
left_positions = [h - width/2 for h in hours]
right_positions = [h + width/2 for h in hours]

plt.figure(figsize=(12, 6))
plt.bar(left_positions, counts_july, width=width, label='July')
plt.bar(right_positions, counts_august, width=width, label='August')

plt.xlabel('Hour')
plt.ylabel('Count of Raw data')
plt.xticks(hours)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
Hourly raw data per month outside of Tartu
No description has been provided for this image
In [51]:
print('Hourly stop data per month outside of Tartu')

# Stops outside the Tartu municipality, counted per start hour for each month
# (both years pooled).
counts_july = df_mun[(df_mun['start_month'] == 7) & (df_mun['in_tartu'] == False)].groupby('start_hour').count()['start_month']
counts_august = df_mun[(df_mun['start_month'] == 8) & (df_mun['in_tartu'] == False)].groupby('start_hour').count()['start_month']

# Hours without any stop are filled with 0 so every hour gets a bar position.
hours = list(range(24))
counts_july = counts_july.reindex(hours, fill_value=0)
counts_august = counts_august.reindex(hours, fill_value=0)

# Two bars per hour, side by side: left = July, right = August.
width = 0.4
plt.figure(figsize=(12, 6))
plt.bar([h - width/2 for h in hours], counts_july, width=width, label='July')
plt.bar([h + width/2 for h in hours], counts_august, width=width, label='August')

plt.xlabel('Hour')
# This figure counts stops, not raw GPS fixes.
plt.ylabel('Count of Stops')
plt.xticks(hours)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
Hourly stop data per month outside of Tartu
No description has been provided for this image

Please describe the average daily time use of the respondent (time at home, working place, summerhouse, in movement)¶

In [52]:
import warnings
# Silence every warning from this point on.
# NOTE(review): a blanket filter also hides deprecation and dtype warnings
# raised by later cells; consider narrowing it to a specific category.
warnings.filterwarnings('ignore')
In [53]:
print('Average Time Use per Hour by Location')

# Cluster id -> place name for the places included in the time-use chart.
cluster_map = {
    CREPP: 'Crepp',
    WERNER: 'Werner',
    HOME: 'Home',
    WORK: 'Work',
    WORK2: 'Work2'
}
df_mun['location_label'] = df_mun['location_cluster'].map(cluster_map)

hours = list(range(24))
# Kept in its own variable so the `locations` GeoDataFrame used by the map
# cells is not overwritten when this cell runs.
location_names = df_mun['location_label'].dropna().unique()
# Hours spent per (hour of day, place). Initialised with floats because
# fractional hours are accumulated into it.
time_matrix = pd.DataFrame(0.0, index=hours, columns=location_names)

one_hour = pd.Timedelta(hours=1)
labelled_stops = df_mun.loc[
    df_mun['location_label'].notna(),
    ['start_time', 'end_time', 'location_label']
]

# Walk through every stop one clock hour at a time and credit each hour of the
# day with exactly the part of the stop that overlaps it. This also covers
# stops that start and end within the same clock hour and stops longer than a day.
for stop_start, stop_end, label in labelled_stops.itertuples(index=False, name=None):
    cursor = stop_start
    while cursor < stop_end:
        next_full_hour = cursor.replace(minute=0, second=0, microsecond=0, nanosecond=0) + one_hour
        segment_end = min(next_full_hour, stop_end)
        time_matrix.at[cursor.hour, label] += (segment_end - cursor).total_seconds() / 3600
        cursor = segment_end

# NOTE(review): the values are totals over the whole recording period, while
# the printed heading says "Average"; divide by the number of observed days if
# a per-day average is intended.
plt.figure(figsize=(15, 8))
bottom = np.zeros(len(time_matrix))

# Stack one bar segment per place on top of the previous ones.
for label in time_matrix.columns:
    plt.bar(time_matrix.index, time_matrix[label], bottom=bottom, label=label)
    bottom += time_matrix[label].values

plt.xlabel("Hour of Day")
plt.ylabel("Total Time Spent (Hours)")
plt.xticks(range(24))
plt.legend()
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
Average Time Use per Hour by Location
No description has been provided for this image
In [54]:
# Column dtypes of the stops table after all derived columns have been added.
df_mun.dtypes
Out[54]:
start_time           datetime64[ns]
end_time             datetime64[ns]
duration_millis             float64
lng                         float64
lat                         float64
radius                      float64
municipality                 object
point                      geometry
start_year                    int32
end_year                      int32
start_month                   int32
end_month                     int32
start_day                     int32
start_day_of_week             int32
start_week                   UInt32
end_day                       int32
start_hour                    int32
end_hour                      int32
location_cluster              int64
in_tartu                       bool
location_label               object
dtype: object

Can you detect any other interesting patterns in the data?¶

In [ ]: