Data Exploration with Pandas
import matplotlib.pyplot as plt
import pandas as pd
### Load the ride-share CSV (no date parsing on this first read)
csv_path = "rideshare_2022_cleaned.csv"
df = pd.read_csv(csv_path)
### Print the column names, dtypes and non-null counts
df.info()
### Summary statistics of the columns
df.describe()
### Reload the same file, parsing the two date columns as datetimes
date_columns = ["trip_start_timestamp", "date"]
df = pd.read_csv(csv_path, parse_dates=date_columns)
### Preview the first five rows
df.head()
### Create new column Hour from the trip start timestamp
# The vectorized .dt accessor extracts the hour for the whole column at once.
# It needs a datetime dtype, which the parse_dates read above provides.
df["hour"] = df["trip_start_timestamp"].dt.hour
### Group Data by hour and count the non-null durations in each hour
trips_per_hour = df.groupby("hour")["duration_mins"].count()
trips_per_hour
### Quickly Plot Data for Pattern
# Share (in percent) of trips shorter than 10 minutes, per start hour.
df_quick = df[df["duration_mins"] < 10]
quick_per_hour = df_quick.groupby("hour")["duration_mins"].count()
# Align on every hour that has trips, so an hour without any quick trip is
# plotted as 0 % instead of turning into a NaN gap in the line.
percent_quick_by_hour = (
    quick_per_hour.reindex(trips_per_hour.index, fill_value=0) / trips_per_hour * 100
)
percent_quick_by_hour.plot()
### Scatter plot of tip against fare
df.plot.scatter(x="fare", y="tip", marker=".")
### Tip distribution, one box per weekday
df.boxplot(by="weekday", column="tip")
### Fare distribution per weekday, restricted to fares above 25
df_fares = df.loc[df["fare"] > 25]
df_fares.boxplot(by="weekday", column="fare")
### Box plot of trip_miles, drawn on a new figure
plt.figure()
df.boxplot(column="trip_miles")
### Tip distribution per weekday (same plot as above, repeated)
df.boxplot(by="weekday", column="tip")
### Histogram of trip_seconds: 30 bins over the range 0..5000
df.hist(column="trip_seconds", range=(0, 5000), bins=30)
### Find Correlation Coefficient
# NOTE(review): df_tippers is not defined anywhere else in the visible script.
# It is assumed to mean the trips on which a tip was actually given --
# TODO confirm the intended filter.
df_tippers = df[df["tip"] > 0]
# Series.corr defaults to the Pearson correlation coefficient.
df_tippers["tip"].corr(df_tippers["trip_miles"])
Location Data Visualization via 2D Histogram
### Select Dropoff Long and Lat Columns
# Drop rows only when one of the two coordinates is missing, so trips are not
# discarded because of a NaN in an unrelated column. Both series come from
# the same filtered frame and therefore stay aligned row by row.
dropoff_coords = df[["dropoff_centroid_longitude", "dropoff_centroid_latitude"]].dropna()
lng_dropoff = dropoff_coords["dropoff_centroid_longitude"]
lat_dropoff = dropoff_coords["dropoff_centroid_latitude"]
### Create 2D Histogram
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
# 50 x 50 bins; density=True normalizes the counts to a probability density.
hist = ax.hist2d(lng_dropoff, lat_dropoff, bins=50, density=True)
# One unit of latitude is drawn 1.3 times as long as one unit of longitude;
# "box" lets matplotlib resize the axes box to honour that ratio.
ax.set_aspect(1.3, "box")
### A Right Side Color Bar
# hist2d returns (counts, xedges, yedges, image); the image at index 3 is the
# mappable the colorbar is built from.
fig.colorbar(hist[3])
ax.set_xlabel("Longitude (degrees)")
ax.set_ylabel("Latitude (degrees)")
Location Data Visualization via Folium Map
import folium
from folium.plugins import FastMarkerCluster
def interactive_map(df, n_samples=4000):
    """Build a folium map with clustered green markers at pickup locations.

    Args:
        df: DataFrame providing the "pickup_centroid_longitude" and
            "pickup_centroid_latitude" columns.
        n_samples: Maximum number of rows to plot, taken from the start of
            the frame after rows with a missing coordinate are dropped.

    Returns:
        The folium.Map, centred on the first plotted pickup point.

    Raises:
        IndexError: If no row has both pickup coordinates.
    """
    points = df[["pickup_centroid_longitude", "pickup_centroid_latitude"]].dropna().iloc[:n_samples]
    if points.empty:
        # Same exception type that indexing an empty frame would raise, but
        # with a message that names the actual problem.
        raise IndexError("no rows with both pickup coordinates to plot")

    latitudes = points["pickup_centroid_latitude"]
    longitudes = points["pickup_centroid_longitude"]

    # Centre the initial view on the first pickup point.
    map3 = folium.Map(location=[latitudes.iloc[0], longitudes.iloc[0]], zoom_start=9)

    # The cluster starts empty; every marker is added individually with a
    # green icon.
    marker_cluster = FastMarkerCluster([]).add_to(map3)
    # Walk the two columns in lockstep rather than building a Series per row
    # with iterrows().
    for latitude, longitude in zip(latitudes, longitudes):
        folium.Marker((latitude, longitude), icon=folium.Icon(color="green")).add_to(marker_cluster)
    return map3
# Build the pickup map for the loaded trips; n_samples is left at its default.
# NOTE(review): the returned map is not assigned -- presumably it is meant to
# be shown as notebook cell output; confirm.
interactive_map(df)