Source code for pyehicle.preprocessing.segmentation

"""
Trajectory segmentation module for pyehicle.

This module provides functions to split GPS trajectories into segments based on
temporal gaps. Useful for separating distinct trips or identifying breaks in
continuous tracking.
"""

import numpy as np
import pandas as pd
import polars as pl


def by_time(
    df: pd.DataFrame | pl.DataFrame,
    time_threshold: float = 30,
    length_threshold: int = 20,
    time_col: str = 'time'
) -> pd.DataFrame | pl.DataFrame | list:
    """
    Segment a trajectory into sub-trajectories at large time gaps.

    The trajectory is cut between every pair of consecutive rows whose time
    difference exceeds ``time_threshold`` seconds. Only pieces with more than
    ``length_threshold`` rows are kept. All original columns are preserved
    (the time column is returned exactly as it was passed in) and row order
    within each segment is unchanged.

    Parameters
    ----------
    df : pd.DataFrame or pl.DataFrame
        The trajectory to segment. Must contain ``time_col``. Rows are
        assumed to be in chronological order; the function does not sort.
    time_threshold : float, default=30
        Maximum allowed gap, in seconds, between consecutive points. A gap
        strictly larger than this causes a split.
    length_threshold : int, default=20
        A segment is kept only if it has strictly more than this many rows.
    time_col : str, default='time'
        Name of the time column. Datetime and timedelta columns are used
        directly. Text/object columns (for example the result of
        ``pd.read_csv`` without ``parse_dates``) are parsed with
        ``pandas.to_datetime`` for the gap computation only.

    Returns
    -------
    pd.DataFrame, pl.DataFrame, or list
        - Input has at most 1 row, or at most ``length_threshold`` rows:
          the original ``df`` object, unchanged.
        - No segment is long enough: the original ``df`` object, unchanged.
        - Exactly one segment is long enough: that segment as a DataFrame of
          the same type as the input.
        - Several segments are long enough: a list of DataFrames.

        pandas segments are copies with a fresh ``0..n-1`` index; polars
        segments are clones of the corresponding slices.

    Raises
    ------
    TypeError
        If the time column is neither datetime-like nor parseable text
        (for example a plain numeric column).

    Notes
    -----
    Errors raised by ``pandas.to_datetime`` for text values it cannot parse
    are propagated unchanged.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.read_csv('full_day_gps.csv')
    >>> segments = by_time(df, time_threshold=600, length_threshold=50)
    >>> if isinstance(segments, list):
    ...     for i, seg in enumerate(segments):
    ...         print(f"Trip {i + 1}: {len(seg)} points")
    ... else:
    ...     print(f"Single continuous trajectory: {len(segments)} points")
    """
    n_rows = len(df)

    # Nothing to split, or the whole input could never yield a long-enough
    # segment: hand the caller's object back untouched.
    if n_rows <= 1 or n_rows <= length_threshold:
        return df

    gaps = _time_gaps(df[time_col].to_numpy(), time_col)
    threshold = pd.Timedelta(time_threshold, unit='s').to_timedelta64()

    # gaps[i] is the gap between rows i and i + 1, so a new segment starts
    # at row i + 1.
    cut_points = (np.flatnonzero(gaps > threshold) + 1).tolist()

    # Consecutive boundary pairs describe every candidate segment, including
    # the trailing one that runs to the end of the trajectory.
    bounds = [0, *cut_points, n_rows]
    is_pandas = isinstance(df, pd.DataFrame)
    segments = [
        _take_rows(df, start, stop, is_pandas)
        for start, stop in zip(bounds[:-1], bounds[1:])
        if stop - start > length_threshold
    ]

    if not segments:
        # Every piece was too short: fall back to the original trajectory.
        return df
    if len(segments) == 1:
        return segments[0]
    return segments


def _time_gaps(time_values: np.ndarray, time_col: str) -> np.ndarray:
    """Return the differences between consecutive timestamps as timedelta64."""
    if time_values.dtype.kind in "OU":
        # Text or Python/pandas datetime objects. utc=True puts every value
        # on one timeline so offset-aware values subtract cleanly; a constant
        # shift does not affect the differences we are after.
        parsed = pd.to_datetime(time_values, utc=True)
        time_values = parsed.tz_convert(None).to_numpy()

    if time_values.dtype.kind not in "mM":
        # Plain numbers have no defined unit here, so refuse to guess rather
        # than silently comparing them against a threshold in seconds.
        raise TypeError(
            f"Column {time_col!r} must contain datetime-like values, "
            f"got dtype {time_values.dtype}"
        )

    return np.diff(time_values)


def _take_rows(df, start: int, stop: int, is_pandas: bool):
    """Return an independent copy of rows ``start:stop`` of ``df``."""
    if is_pandas:
        return df.iloc[start:stop].copy().reset_index(drop=True)
    # Anything that is not a pandas frame is treated as polars.
    return df[start:stop].clone()