Data Exploration and Visualization

Data Exploration with Pandas

import matplotlib.pyplot as plt
import pandas as pd

### Read CSV Data
CSV_PATH = "rideshare_2022_cleaned.csv"
df = pd.read_csv(CSV_PATH)

### Show Column Info
df.info()

### Display key statistics
df.describe()

### Read CSV again, parsing the two timestamp columns as datetimes
df = pd.read_csv(CSV_PATH, parse_dates=["trip_start_timestamp", "date"])

### Display the first five rows
df.head(5)

### Create new column Hour from the trip start timestamp
# The vectorized ``.dt`` accessor extracts the hour for the whole column in
# one call instead of invoking a Python lambda once per row.
df["hour"] = df["trip_start_timestamp"].dt.hour

### Group Data by hour and count the rows in each hour
# ``count()`` tallies the non-null ``duration_mins`` values per hour.
df.groupby("hour")["duration_mins"].count()

### Quickly Plot Data for Pattern
# Percentage of trips with duration_mins below 10, per hour.
df_quick = df[df["duration_mins"] < 10]
trips_by_hour = df.groupby("hour")["duration_mins"].count()
# Reindex on every hour present in the full data so that an hour with no
# quick trips counts as 0 instead of dropping out of the division as NaN
# (which would show up as a gap in the plotted line).
quick_by_hour = (
    df_quick.groupby("hour")["duration_mins"]
    .count()
    .reindex(trips_by_hour.index, fill_value=0)
)
percent_quick_by_hour = quick_by_hour / trips_by_hour * 100
percent_quick_by_hour.plot()

### Plot Data by Two Columns
# Tip against fare, drawn with small dot markers.
df.plot.scatter(x="fare", y="tip", marker=".")

### Categorize Data by Weekdays
# One box of tip values per weekday value.
df.boxplot(column="tip", by="weekday")

### Filter and Categorize Data by Weekday
# Keep only the rows whose fare is above 25, then box-plot fare per weekday.
df_fares = df.loc[df["fare"] > 25]
df_fares.boxplot(column="fare", by="weekday")

### Box Plot
# Start a fresh figure for the trip_miles box plot.
plt.figure()
df.boxplot(column="trip_miles")

### Box Plot by Weekday
df.boxplot(column="tip", by="weekday")

### Histogram Plot
# 30 bins over trip_seconds values between 0 and 5000.
df.hist(column="trip_seconds", bins=30, range=(0, 5000))

### Find Correlation Coefficient
# ``df_tippers`` is not defined anywhere else in this file, so it is built
# here to avoid a NameError.
# NOTE(review): assumed to mean the trips with a non-zero tip; confirm the
# intended filter.
df_tippers = df[df["tip"] > 0]
# Series.corr uses the Pearson method by default.
df_tippers["tip"].corr(df_tippers["trip_miles"])

Location Data Visualization via 2D Histogram

### Select Dropoff Long and Lat Columns
# ``df.dropna()`` removes every row that has a missing value in any column.
# Run it once and reuse the result for both coordinates instead of filtering
# the whole frame twice.
df_complete = df.dropna()
lng_dropoff = df_complete["dropoff_centroid_longitude"]
lat_dropoff = df_complete["dropoff_centroid_latitude"]

### Create 2D Histogram
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
# 50 bins per axis; density=True normalizes the counts into a density.
hist = ax.hist2d(lng_dropoff, lat_dropoff, bins=50, density=True)
ax.set_aspect(1.3, "box")

### A Right Side Color Bar
# hist2d returns (counts, xedges, yedges, image); the colorbar is built from
# the image, i.e. the fourth element.
fig.colorbar(hist[3])
ax.set_xlabel("Longitude (degrees)")
ax.set_ylabel("Latitude (degrees)")

Location Data Visualization via Folium Map

import folium 
from folium.plugins import FastMarkerCluster

def interactive_map(df, n_samples=4000):
    """Build a folium map with clustered green markers at pickup locations.

    Args:
        df: DataFrame containing ``pickup_centroid_longitude`` and
            ``pickup_centroid_latitude`` columns.
        n_samples: Upper bound on the number of rows placed on the map. Rows
            are taken from the top after dropping those with a missing
            coordinate.

    Returns:
        The ``folium.Map`` instance, centered on the first plotted point.

    Raises:
        IndexError: If no row has both pickup coordinates (nothing to
            center the map on).
    """
    lat_col = "pickup_centroid_latitude"
    lng_col = "pickup_centroid_longitude"

    points = df[[lng_col, lat_col]].dropna()[:n_samples]

    # The first remaining point is used as the initial map center.
    first_point = points.iloc[0]
    map3 = folium.Map(
        location=[first_point[lat_col], first_point[lng_col]], zoom_start=9
    )

    marker_cluster = FastMarkerCluster([]).add_to(map3)

    # Walk the two coordinate columns in lockstep rather than iterrows(),
    # which constructs a Series object for every row.
    for latitude, longitude in zip(points[lat_col], points[lng_col]):
        folium.Marker(
            (latitude, longitude), icon=folium.Icon(color="green")
        ).add_to(marker_cluster)

    return map3
    
# Build the pickup-location map from the full DataFrame, using the default
# n_samples limit.
interactive_map(df)
Want to receive updates on the fastest AI models, successful AI startups, and new hiring candidates? Subscribe to my newsletter.
Subscribe