import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the 202106 Citi Bike trip-data CSV and preview the first rows.
# low_memory=False parses the file in a single pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
# NOTE(review): the variable name says 2020, but the file name (202106) points
# at June 2021; the name is kept because later cells reference it.
citi_bike_2020 = pd.read_csv("./202106-citibike-tripdata.csv", low_memory=False)
# dropna() returns a new frame rather than modifying in place, so the result
# has to be assigned back for rows with missing values to actually be removed.
citi_bike_2020 = citi_bike_2020.dropna()
citi_bike_2020.head()
| | ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6339C86D47EC0FAB | docked_bike | 2021-06-02 17:18:45 | 2021-06-02 17:26:17 | W 51 St & 6 Ave | 6740.10 | E 66 St & Madison Ave | 6969.08 | 40.760659 | -73.980420 | 40.768009 | -73.968453 | member |
1 | CDE3C147775B4002 | docked_bike | 2021-06-04 21:01:46 | 2021-06-04 21:32:09 | W 56 St & 10 Ave | 6955.01 | E 9 St & Avenue C | 5616.01 | 40.768254 | -73.988639 | 40.725213 | -73.977688 | member |
2 | D335C3F6B412B846 | docked_bike | 2021-06-15 16:39:36 | 2021-06-15 16:55:02 | E 56 St & Madison Ave | 6732.01 | E 16 St & 5 Ave | 6022.04 | 40.761573 | -73.972628 | 40.737262 | -73.992390 | member |
3 | 8DA6810777C89735 | docked_bike | 2021-06-08 11:36:40 | 2021-06-08 11:39:41 | E 68 St & 3 Ave | 6896.16 | 1 Ave & E 62 St | 6753.08 | 40.767128 | -73.962246 | 40.761227 | -73.960940 | member |
4 | 62D9B66A3A1D9FDF | docked_bike | 2021-06-18 12:48:43 | 2021-06-18 13:04:14 | E 48 St & 5 Ave | 6626.01 | W 20 St & 8 Ave | 6224.05 | 40.757245 | -73.978059 | 40.743453 | -74.000040 | member |
# Cleaning: compute each ride's duration and get rid of massive outliers.
MINUTES_PER_DAY = 24 * 60

# Start again from the raw file so this step does not depend on any earlier
# in-memory changes to the frame.
citi_bike_2020 = pd.read_csv("./202106-citibike-tripdata.csv", low_memory=False)
citi_bike_2020['started_at'] = pd.to_datetime(citi_bike_2020['started_at'])
citi_bike_2020['ended_at'] = pd.to_datetime(citi_bike_2020['ended_at'])

# Ride duration expressed in minutes, rounded to a whole number.
ride_duration = citi_bike_2020['ended_at'] - citi_bike_2020['started_at']
citi_bike_2020['time_of_ride'] = round(ride_duration / np.timedelta64(1, 'm'))

# Keep only rides whose rounded duration is above 0 minutes and below one day
# (1440 minutes). Rides that round down to 0 minutes, rides with a negative
# duration, and rides of a day or longer are discarded.
is_plausible = (
    (citi_bike_2020['time_of_ride'] > 0)
    & (citi_bike_2020['time_of_ride'] < MINUTES_PER_DAY)
)
# .copy() makes the filtered frame an independent object, so adding columns to
# it later does not raise pandas' SettingWithCopyWarning.
citi_bike_2020 = citi_bike_2020[is_plausible].copy()

# Number of rides per station, busiest first (ended-at, then started-at).
citi_bike_2020.groupby('end_station_name').size().sort_values(ascending=False)
citi_bike_2020.groupby('start_station_name').size().sort_values(ascending=False)
start_station_name 7 Ave & Central Park South 16586 West St & Chambers St 14781 E 17 St & Broadway 14223 W 21 St & 6 Ave 13885 Broadway & W 25 St 12696 ... Buchanan Pl & Grand Ave 36 Aqueduct Ave & North St 30 Nassau St\t& Duffield St 21 Ditmars Blvd & 43 St 16 4455.10 10 Length: 1477, dtype: int64
# Bar chart of the 30 stations where the most rides begin.
sns.set_theme(style='darkgrid')

rides_per_start_station = citi_bike_2020.groupby('start_station_name').size()
result = rides_per_start_station.sort_values(ascending=False)[:30]

ax = sns.barplot(x=result.index, y=result.values, palette='rocket')

# Slant the station names so the 30 labels do not overlap.
station_labels = ax.get_xticklabels()
ax.set_xticklabels(station_labels, rotation=70, ha="right", fontsize=9)

ax.set_title('Top 30 Most Started At Stations')
ax.set_xlabel('Station Name')
ax.set_ylabel('Number Of Start Rides')
plt.show()

# The same top 30 computed with value_counts, shown as a table.
citi_bike_2020.start_station_name.value_counts()[:30]
7 Ave & Central Park South 16586 West St & Chambers St 14781 E 17 St & Broadway 14223 W 21 St & 6 Ave 13885 Broadway & W 25 St 12696 12 Ave & W 40 St 12169 West St & Liberty St 11981 1 Ave & E 68 St 11828 Cleveland Pl & Spring St 11682 Central Park S & 6 Ave 11331 Carmine St & 6 Ave 11141 8 Ave & W 52 St 11065 10 Ave & W 14 St 10856 Pershing Square North 10495 Broadway & E 14 St 10435 6 Ave & W 33 St 10359 E 13 St & Avenue A 10326 W 22 St & 10 Ave 10225 W 63 St & Broadway 10173 Pier 40 - Hudson River Park 10065 E 33 St & 1 Ave 10051 W 30 St & 10 Ave 9707 9 Ave & W 22 St 9691 Grand St & Elizabeth St 9663 11 Ave & W 41 St 9647 Grand Army Plaza & Central Park S 9575 6 Ave & W 34 St 9454 Broadway & W 51 St 9431 W 4 St & 7 Ave S 9355 E 10 St & Avenue A 9258 Name: start_station_name, dtype: int64
# Bar chart of the 30 stations where the most rides end.
result = (
    citi_bike_2020.groupby('end_station_name')
    .size()
    .sort_values(ascending=False)[:30]
)
sns.set_theme(style='darkgrid')
ax = sns.barplot(x=result.index, y=result.values, palette="ch:s=-.7,r=.64")
# Slant the station names so the 30 labels do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=70, ha="right", fontsize=9)
plt.title('Top 30 Most Ended At Stations')
# This chart counts rides that END at each station, so the axis says "End"
# (the label previously read "Start", copied from the start-station chart).
plt.ylabel('Number Of End Rides')
plt.xlabel('Station Name')
plt.show()

# The same top 30 computed with value_counts, shown as a table.
citi_bike_2020.end_station_name.value_counts()[:30]
7 Ave & Central Park South 16582 West St & Chambers St 14828 E 17 St & Broadway 14392 W 21 St & 6 Ave 13922 Broadway & W 25 St 12735 12 Ave & W 40 St 12258 West St & Liberty St 11977 1 Ave & E 68 St 11939 Cleveland Pl & Spring St 11709 Central Park S & 6 Ave 11231 Carmine St & 6 Ave 11159 10 Ave & W 14 St 11138 8 Ave & W 52 St 11085 Pershing Square North 10464 W 22 St & 10 Ave 10423 Broadway & E 14 St 10373 6 Ave & W 33 St 10343 E 13 St & Avenue A 10311 W 63 St & Broadway 10183 Pier 40 - Hudson River Park 10131 E 33 St & 1 Ave 10092 Grand St & Elizabeth St 9751 9 Ave & W 22 St 9691 W 30 St & 10 Ave 9675 11 Ave & W 41 St 9646 Grand Army Plaza & Central Park S 9531 E 10 St & Avenue A 9369 Broadway & W 51 St 9359 W 4 St & 7 Ave S 9314 Greenwich Ave & 8 Ave 9295 Name: end_station_name, dtype: int64
# Rides per day of the week, based on each ride's start time.
# pandas numbers weekdays Monday=0 through Sunday=6.
WEEKDAY_NAMES = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')

# assign() returns a new frame with the extra column, which avoids the
# SettingWithCopyWarning that in-place assignment raises on a filtered frame.
citi_bike_2020 = citi_bike_2020.assign(
    day_of_week=pd.to_datetime(citi_bike_2020['started_at']).dt.weekday
)
result = citi_bike_2020.groupby('day_of_week').size()

# Show weekday names on the axis instead of the bare 0-6 codes; `result`
# itself keeps its numeric index. int() guards against a float-typed index.
weekday_labels = [WEEKDAY_NAMES[int(day)] for day in result.index]
ax = sns.barplot(x=weekday_labels, y=result.values, palette="ch:s=-.7,r=.64")
ax.set_title('Rides By Day Of Week')
ax.set_xlabel('Day Of Week')
ax.set_ylabel('Number Of Rides')
plt.show()
result
day_of_week 0 378414 1 467023 2 554289 3 428176 4 429312 5 459096 6 420981 dtype: int64