import ibis
= True
ibis.options.interactive
#from trino.auth import OAuth2Authentication
This notebook example involves using a managed version of Trino (Starburst). It will work without Starburst provided you are able to import data into a Trino cluster connected to a lake. We will be using one month of Yellow Taxi data from https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page and a zone look up file provided on the same page. Please download both files and register in either your Starburst or Trino cluster before proceeding.
IMPORTANT!!!! Change your user, host, port, database, schema and roles to be relevant to your Starburst Galaxy setup. If you are using OAuth2, uncomment the keyword lines roles, and auth. Then comment PASSWORD to proceed. You can reference: https://ibis-project.org/backends/trino#connecting-to-starburst-managed-trino-instances for more information.
import os
= ibis.trino.connect(
con =os.environ['user'],
user=os.environ['host'],
host=os.environ['password'],
password=443,
port=os.environ['database'],
database=os.environ['schema'],
schema#roles="accountadmin",
#auth=OAuth2Authentication(),
="https"
http_scheme )
Within Ibis con.list_tables() allows us to list all the tables.
con.list_tables()
['groupbyboroughtrips', 'taxizonenyc', 'zonelookup']
Ibis tables in trino can be referenced through the use of con.table. We’re going to create two reference ibis tables to Trino from our tables below:
= con.table("taxizonenyc")
nycjantrips = con.table("zonelookup") zonelookup
In Ibis we can check the schema of the tables we just referenced through .schema()
nycjantrips.schema()
ibis.Schema {
vendorid int64
tpep_pickup_datetime timestamp(3)
tpep_dropoff_datetime timestamp(3)
passenger_count float64
trip_distance float64
ratecodeid float64
store_and_fwd_flag string
pulocationid int64
dolocationid int64
payment_type int64
fare_amount float64
extra float64
mta_tax float64
tip_amount float64
tolls_amount float64
improvement_surcharge float64
total_amount float64
congestion_surcharge float64
airport_fee float64
}
zonelookup.schema()
ibis.Schema {
locationid string
borough string
zone string
service_zone string
}
We’re going to preview the dataset with ibis slice method. We can see the first 10 rows here. We also included ibis.options.interactive = True at the start of our notebook which allows us to display the ibis tables in a prettified way.
0:10] nycjantrips[
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ vendorid ┃ tpep_pickup_datetime ┃ tpep_dropoff_datetime ┃ passenger_count ┃ trip_distance ┃ ratecodeid ┃ store_and_fwd_flag ┃ pulocationid ┃ dolocationid ┃ payment_type ┃ fare_amount ┃ extra ┃ mta_tax ┃ tip_amount ┃ tolls_amount ┃ improvement_surcharge ┃ total_amount ┃ congestion_surcharge ┃ airport_fee ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ │ int64 │ timestamp(3) │ timestamp(3) │ float64 │ float64 │ float64 │ string │ int64 │ int64 │ int64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ ├──────────┼──────────────────────┼───────────────────────┼─────────────────┼───────────────┼────────────┼────────────────────┼──────────────┼──────────────┼──────────────┼─────────────┼─────────┼─────────┼────────────┼──────────────┼───────────────────────┼──────────────┼──────────────────────┼─────────────┤ │ 2 │ 2023-01-01 00:32:10 │ 2023-01-01 00:40:36 │ 1.0 │ 0.97 │ 1.0 │ N │ 161 │ 141 │ 2 │ 9.3 │ 1.00 │ 0.5 │ 0.00 │ 0.0 │ 1.0 │ 14.30 │ 2.5 │ 0.00 │ │ 2 │ 2023-01-01 00:55:08 │ 2023-01-01 01:01:27 │ 1.0 │ 1.10 │ 1.0 │ N │ 43 │ 237 │ 1 │ 7.9 │ 1.00 │ 0.5 │ 4.00 │ 0.0 │ 1.0 │ 16.90 │ 2.5 │ 0.00 │ │ 2 │ 2023-01-01 00:25:04 │ 2023-01-01 00:37:49 │ 1.0 │ 2.51 │ 1.0 │ N │ 48 │ 238 │ 1 │ 14.9 │ 1.00 │ 0.5 │ 15.00 │ 0.0 │ 1.0 │ 34.90 │ 2.5 │ 0.00 │ │ 1 │ 2023-01-01 00:03:48 │ 2023-01-01 00:13:25 │ 0.0 │ 1.90 │ 1.0 │ N │ 138 │ 7 │ 1 │ 12.1 │ 7.25 │ 0.5 │ 0.00 │ 0.0 │ 1.0 │ 20.85 │ 0.0 │ 1.25 │ │ 2 │ 2023-01-01 00:10:29 │ 2023-01-01 00:21:19 │ 1.0 │ 1.43 │ 1.0 │ N │ 107 │ 79 │ 1 │ 11.4 │ 1.00 │ 0.5 │ 3.28 │ 0.0 │ 1.0 │ 19.68 │ 2.5 │ 0.00 │ │ 2 │ 2023-01-01 00:50:34 │ 2023-01-01 01:02:52 │ 1.0 │ 1.84 │ 1.0 │ N │ 161 │ 137 │ 1 │ 12.8 │ 1.00 │ 0.5 │ 10.00 │ 0.0 │ 1.0 │ 27.80 │ 2.5 │ 0.00 │ │ 2 │ 2023-01-01 00:09:22 │ 2023-01-01 00:19:49 │ 1.0 │ 1.66 │ 1.0 │ N │ 239 │ 143 │ 1 │ 12.1 │ 1.00 │ 0.5 │ 3.42 │ 0.0 │ 1.0 │ 20.52 │ 2.5 │ 0.00 │ │ 2 │ 2023-01-01 00:27:12 │ 2023-01-01 00:49:56 │ 1.0 │ 11.70 │ 1.0 │ N │ 142 │ 200 │ 1 │ 45.7 │ 1.00 │ 0.5 │ 10.74 │ 3.0 │ 1.0 │ 64.44 │ 2.5 │ 0.00 │ │ 2 │ 2023-01-01 00:21:44 │ 2023-01-01 00:36:40 │ 1.0 │ 2.95 │ 1.0 │ N │ 164 │ 236 │ 1 │ 17.7 │ 1.00 │ 0.5 │ 5.68 │ 0.0 │ 1.0 │ 28.38 │ 2.5 │ 0.00 │ │ 2 │ 2023-01-01 00:39:42 │ 2023-01-01 00:50:36 │ 1.0 │ 3.01 │ 1.0 │ N │ 141 │ 107 │ 2 │ 14.9 │ 1.00 │ 0.5 │ 0.00 │ 0.0 │ 1.0 │ 19.90 │ 2.5 │ 0.00 │ └──────────┴──────────────────────┴───────────────────────┴─────────────────┴───────────────┴────────────┴────────────────────┴──────────────┴──────────────┴──────────────┴─────────────┴─────────┴─────────┴────────────┴──────────────┴───────────────────────┴──────────────┴──────────────────────┴─────────────┘
To understand the dataset a little more we can try an order by. Looks like there are some columns with passenger count of undefined. In this case we’re going to want to curate the dataset and clean it up a bit to ensure more accurate data.
nycjantrips.order_by(nycjantrips.trip_distance.desc())
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ vendorid ┃ tpep_pickup_datetime ┃ tpep_dropoff_datetime ┃ passenger_count ┃ trip_distance ┃ ratecodeid ┃ store_and_fwd_flag ┃ pulocationid ┃ dolocationid ┃ payment_type ┃ fare_amount ┃ extra ┃ mta_tax ┃ tip_amount ┃ tolls_amount ┃ improvement_surcharge ┃ total_amount ┃ congestion_surcharge ┃ airport_fee ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ │ int64 │ timestamp(3) │ timestamp(3) │ float64 │ float64 │ float64 │ string │ int64 │ int64 │ int64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ ├──────────┼──────────────────────┼───────────────────────┼─────────────────┼───────────────┼────────────┼────────────────────┼──────────────┼──────────────┼──────────────┼─────────────┼─────────┼─────────┼────────────┼──────────────┼───────────────────────┼──────────────┼──────────────────────┼─────────────┤ │ 2 │ 2023-01-23 17:14:00 │ 2023-01-23 17:20:00 │ NULL │ 258928.15 │ NULL │ NULL │ 238 │ 239 │ 0 │ 13.65 │ 0.0 │ 0.5 │ 3.53 │ 0.0 │ 1.0 │ 21.18 │ NULL │ NULL │ │ 2 │ 2023-01-07 18:18:00 │ 2023-01-07 18:47:00 │ NULL │ 225987.37 │ NULL │ NULL │ 261 │ 263 │ 0 │ 56.49 │ 0.0 │ 0.5 │ 12.10 │ 0.0 │ 1.0 │ 72.59 │ NULL │ NULL │ │ 2 │ 2023-01-15 08:33:00 │ 2023-01-15 08:43:00 │ NULL │ 187872.33 │ NULL │ NULL │ 229 │ 142 │ 0 │ 14.02 │ 0.0 │ 0.5 │ 3.60 │ 0.0 │ 1.0 │ 21.62 │ NULL │ NULL │ │ 2 │ 2023-01-13 05:53:00 │ 2023-01-13 06:02:00 │ NULL │ 116439.71 │ NULL │ NULL │ 239 │ 230 │ 0 │ 13.22 │ 0.0 │ 0.5 │ 3.00 │ 0.0 │ 1.0 │ 20.22 │ NULL │ NULL │ │ 2 │ 2023-01-31 07:18:00 │ 2023-01-31 07:57:00 │ NULL │ 85543.66 │ NULL │ NULL │ 41 │ 87 │ 0 │ 39.90 │ 0.0 │ 0.5 │ 7.90 │ 0.0 │ 1.0 │ 51.80 │ NULL │ NULL │ │ 2 │ 2023-01-12 08:45:00 │ 2023-01-12 09:09:00 │ NULL │ 76886.52 │ NULL │ NULL │ 148 │ 68 │ 0 │ 21.49 │ 0.0 │ 0.5 │ 5.10 │ 0.0 │ 1.0 │ 30.59 │ NULL │ NULL │ │ 2 │ 2023-01-01 19:40:43 │ 2023-01-01 20:21:48 │ 1.0 │ 62359.52 │ 4.0 │ N │ 79 │ 48 │ 2 │ 0.00 │ 0.0 │ 0.0 │ 0.00 │ 0.0 │ 0.0 │ 2.50 │ 2.5 │ 0.0 │ │ 2 │ 2023-01-31 21:24:00 │ 2023-01-31 21:28:00 │ NULL │ 52042.30 │ NULL │ NULL │ 236 │ 75 │ 0 │ 14.40 │ 0.0 │ 0.5 │ 1.84 │ 0.0 │ 1.0 │ 20.24 │ NULL │ NULL │ │ 2 │ 2023-01-22 11:25:00 │ 2023-01-22 11:33:00 │ NULL │ 33205.32 │ NULL │ NULL │ 230 │ 43 │ 0 │ 12.87 │ 0.0 │ 0.5 │ 3.37 │ 0.0 │ 1.0 │ 20.24 │ NULL │ NULL │ │ 2 │ 2023-01-23 07:29:00 │ 2023-01-23 07:51:00 │ NULL │ 16562.61 │ NULL │ NULL │ 262 │ 90 │ 0 │ 28.87 │ 0.0 │ 0.5 │ 4.93 │ 0.0 │ 1.0 │ 37.80 │ NULL │ NULL │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └──────────┴──────────────────────┴───────────────────────┴─────────────────┴───────────────┴────────────┴────────────────────┴──────────────┴──────────────┴──────────────┴─────────────┴─────────┴─────────┴────────────┴──────────────┴───────────────────────┴──────────────┴──────────────────────┴─────────────┘
We can chain together expressions with filter - similar to a WHERE clause in SQL. We can see nan (not a number) involved, ibis also has built-in support for that.
= nycjantrips.filter((nycjantrips.passenger_count != 0) | (nycjantrips.passenger_count.isnan() is False))
nyc_filtered nyc_filtered
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ vendorid ┃ tpep_pickup_datetime ┃ tpep_dropoff_datetime ┃ passenger_count ┃ trip_distance ┃ ratecodeid ┃ store_and_fwd_flag ┃ pulocationid ┃ dolocationid ┃ payment_type ┃ fare_amount ┃ extra ┃ mta_tax ┃ tip_amount ┃ tolls_amount ┃ improvement_surcharge ┃ total_amount ┃ congestion_surcharge ┃ airport_fee ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ │ int64 │ timestamp(3) │ timestamp(3) │ float64 │ float64 │ float64 │ string │ int64 │ int64 │ int64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ ├──────────┼──────────────────────┼───────────────────────┼─────────────────┼───────────────┼────────────┼────────────────────┼──────────────┼──────────────┼──────────────┼─────────────┼─────────┼─────────┼────────────┼──────────────┼───────────────────────┼──────────────┼──────────────────────┼─────────────┤ │ 2 │ 2023-01-01 00:32:10 │ 2023-01-01 00:40:36 │ 1.0 │ 0.97 │ 1.0 │ N │ 161 │ 141 │ 2 │ 9.3 │ 1.0 │ 0.5 │ 0.00 │ 0.0 │ 1.0 │ 14.30 │ 2.5 │ 0.0 │ │ 2 │ 2023-01-01 00:55:08 │ 2023-01-01 01:01:27 │ 1.0 │ 1.10 │ 1.0 │ N │ 43 │ 237 │ 1 │ 7.9 │ 1.0 │ 0.5 │ 4.00 │ 0.0 │ 1.0 │ 16.90 │ 2.5 │ 0.0 │ │ 2 │ 2023-01-01 00:25:04 │ 2023-01-01 00:37:49 │ 1.0 │ 2.51 │ 1.0 │ N │ 48 │ 238 │ 1 │ 14.9 │ 1.0 │ 0.5 │ 15.00 │ 0.0 │ 1.0 │ 34.90 │ 2.5 │ 0.0 │ │ 2 │ 2023-01-01 00:10:29 │ 2023-01-01 00:21:19 │ 1.0 │ 1.43 │ 1.0 │ N │ 107 │ 79 │ 1 │ 11.4 │ 1.0 │ 0.5 │ 3.28 │ 0.0 │ 1.0 │ 19.68 │ 2.5 │ 0.0 │ │ 2 │ 2023-01-01 00:50:34 │ 2023-01-01 01:02:52 │ 1.0 │ 1.84 │ 1.0 │ N │ 161 │ 137 │ 1 │ 12.8 │ 1.0 │ 0.5 │ 10.00 │ 0.0 │ 1.0 │ 27.80 │ 2.5 │ 0.0 │ │ 2 │ 2023-01-01 00:09:22 │ 2023-01-01 00:19:49 │ 1.0 │ 1.66 │ 1.0 │ N │ 239 │ 143 │ 1 │ 12.1 │ 1.0 │ 0.5 │ 3.42 │ 0.0 │ 1.0 │ 20.52 │ 2.5 │ 0.0 │ │ 2 │ 2023-01-01 00:27:12 │ 2023-01-01 00:49:56 │ 1.0 │ 11.70 │ 1.0 │ N │ 142 │ 200 │ 1 │ 45.7 │ 1.0 │ 0.5 │ 10.74 │ 3.0 │ 1.0 │ 64.44 │ 2.5 │ 0.0 │ │ 2 │ 2023-01-01 00:21:44 │ 2023-01-01 00:36:40 │ 1.0 │ 2.95 │ 1.0 │ N │ 164 │ 236 │ 1 │ 17.7 │ 1.0 │ 0.5 │ 5.68 │ 0.0 │ 1.0 │ 28.38 │ 2.5 │ 0.0 │ │ 2 │ 2023-01-01 00:39:42 │ 2023-01-01 00:50:36 │ 1.0 │ 3.01 │ 1.0 │ N │ 141 │ 107 │ 2 │ 14.9 │ 1.0 │ 0.5 │ 0.00 │ 0.0 │ 1.0 │ 19.90 │ 2.5 │ 0.0 │ │ 2 │ 2023-01-01 00:53:01 │ 2023-01-01 01:01:45 │ 1.0 │ 1.80 │ 1.0 │ N │ 234 │ 68 │ 1 │ 11.4 │ 1.0 │ 0.5 │ 3.28 │ 0.0 │ 1.0 │ 19.68 │ 2.5 │ 0.0 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └──────────┴──────────────────────┴───────────────────────┴─────────────────┴───────────────┴────────────┴────────────────────┴──────────────┴──────────────┴──────────────┴─────────────┴─────────┴─────────┴────────────┴──────────────┴───────────────────────┴──────────────┴──────────────────────┴─────────────┘
You can see with the command below that nan has been filtered out!
nyc_filtered.order_by(nyc_filtered.trip_distance.desc())
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ vendorid ┃ tpep_pickup_datetime ┃ tpep_dropoff_datetime ┃ passenger_count ┃ trip_distance ┃ ratecodeid ┃ store_and_fwd_flag ┃ pulocationid ┃ dolocationid ┃ payment_type ┃ fare_amount ┃ extra ┃ mta_tax ┃ tip_amount ┃ tolls_amount ┃ improvement_surcharge ┃ total_amount ┃ congestion_surcharge ┃ airport_fee ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ │ int64 │ timestamp(3) │ timestamp(3) │ float64 │ float64 │ float64 │ string │ int64 │ int64 │ int64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ ├──────────┼──────────────────────┼───────────────────────┼─────────────────┼───────────────┼────────────┼────────────────────┼──────────────┼──────────────┼──────────────┼─────────────┼─────────┼─────────┼────────────┼──────────────┼───────────────────────┼──────────────┼──────────────────────┼─────────────┤ │ 2 │ 2023-01-01 19:40:43 │ 2023-01-01 20:21:48 │ 1.0 │ 62359.52 │ 4.0 │ N │ 79 │ 48 │ 2 │ 0.0 │ 0.0 │ 0.0 │ 0.00 │ 0.00 │ 0.0 │ 2.50 │ 2.5 │ 0.00 │ │ 2 │ 2023-01-31 15:53:23 │ 2023-01-31 16:50:38 │ 2.0 │ 14098.55 │ 2.0 │ N │ 142 │ 132 │ 1 │ 70.0 │ 0.0 │ 0.5 │ 0.00 │ 0.00 │ 1.0 │ 74.00 │ 2.5 │ 0.00 │ │ 2 │ 2023-01-23 08:18:50 │ 2023-01-23 08:47:22 │ 1.0 │ 9947.03 │ 1.0 │ N │ 239 │ 140 │ 1 │ 23.3 │ 0.0 │ 0.5 │ 1.91 │ 0.00 │ 1.0 │ 29.21 │ 2.5 │ 0.00 │ │ 2 │ 2023-01-26 15:20:46 │ 2023-01-26 16:01:48 │ 1.0 │ 9684.00 │ 3.0 │ N │ 164 │ 265 │ 1 │ 97.9 │ 0.0 │ 0.0 │ 24.98 │ 26.00 │ 1.0 │ 149.88 │ 0.0 │ 0.00 │ │ 2 │ 2023-01-18 19:37:41 │ 2023-01-18 20:14:29 │ 1.0 │ 9683.76 │ 1.0 │ N │ 264 │ 112 │ 1 │ 63.9 │ 2.5 │ 0.5 │ 5.00 │ 0.00 │ 1.0 │ 74.15 │ 0.0 │ 1.25 │ │ 2 │ 2023-01-03 15:25:05 │ 2023-01-03 16:09:02 │ 1.0 │ 9680.61 │ 1.0 │ N │ 264 │ 244 │ 1 │ 47.1 │ 0.0 │ 0.5 │ 10.22 │ 0.00 │ 1.0 │ 61.32 │ 2.5 │ 0.00 │ │ 2 │ 2023-01-18 19:43:55 │ 2023-01-18 20:02:33 │ 1.0 │ 9679.83 │ 1.0 │ N │ 264 │ 49 │ 1 │ 35.2 │ 7.5 │ 0.5 │ 3.00 │ 0.00 │ 1.0 │ 48.45 │ 0.0 │ 1.25 │ │ 2 │ 2023-01-05 15:26:43 │ 2023-01-05 16:04:17 │ 1.0 │ 9679.73 │ 1.0 │ N │ 264 │ 244 │ 1 │ 47.1 │ 5.0 │ 0.5 │ 12.53 │ 6.55 │ 1.0 │ 76.43 │ 2.5 │ 1.25 │ │ 2 │ 2023-01-19 19:36:58 │ 2023-01-19 20:01:44 │ 1.0 │ 9675.57 │ 1.0 │ N │ 264 │ 151 │ 1 │ 24.0 │ 2.5 │ 0.5 │ 1.58 │ 0.00 │ 1.0 │ 32.08 │ 2.5 │ 0.00 │ │ 2 │ 2023-01-09 15:44:58 │ 2023-01-09 16:02:35 │ 1.0 │ 9674.67 │ 1.0 │ N │ 264 │ 239 │ 2 │ 17.7 │ 0.0 │ 0.5 │ 0.00 │ 0.00 │ 1.0 │ 19.20 │ 0.0 │ 0.00 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └──────────┴──────────────────────┴───────────────────────┴─────────────────┴───────────────┴────────────┴────────────────────┴──────────────┴──────────────┴──────────────┴─────────────┴─────────┴─────────┴────────────┴──────────────┴───────────────────────┴──────────────┴──────────────────────┴─────────────┘
Let’s add a column to our dataset. I want to add a column to help calculate the average ride duration. We are going to use the Ibis ‘Delta’ function for this result Ibis is also pretty cool and can simply visualize a column in isolation:
= nyc_filtered.tpep_dropoff_datetime.delta(nyc_filtered.tpep_pickup_datetime, "minute").name("rideminutes")
ride_duration ride_duration
┏━━━━━━━━━━━━━┓ ┃ rideminutes ┃ ┡━━━━━━━━━━━━━┩ │ int64 │ ├─────────────┤ │ 8 │ │ 6 │ │ 12 │ │ 11 │ │ 12 │ │ 10 │ │ 22 │ │ 15 │ │ 11 │ │ 8 │ │ … │ └─────────────┘
We can also combine the column with our original table using the ‘mutate’ method shown here.
= nyc_filtered.mutate(rideminutes=nyc_filtered.tpep_dropoff_datetime.delta(nyc_filtered.tpep_pickup_datetime, "minute"))
nycjanduration "vendorid","rideminutes","trip_distance"] nycjanduration[
┏━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ vendorid ┃ rideminutes ┃ trip_distance ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ int64 │ int64 │ float64 │ ├──────────┼─────────────┼───────────────┤ │ 2 │ 8 │ 0.97 │ │ 2 │ 6 │ 1.10 │ │ 2 │ 12 │ 2.51 │ │ 2 │ 11 │ 1.43 │ │ 2 │ 12 │ 1.84 │ │ 2 │ 10 │ 1.66 │ │ 2 │ 22 │ 11.70 │ │ 2 │ 15 │ 2.95 │ │ 2 │ 11 │ 3.01 │ │ 2 │ 8 │ 1.80 │ │ … │ … │ … │ └──────────┴─────────────┴───────────────┘
"vendorid","rideminutes","trip_distance"].head(3) nycjanduration[
┏━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ vendorid ┃ rideminutes ┃ trip_distance ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ int64 │ int64 │ float64 │ ├──────────┼─────────────┼───────────────┤ │ 2 │ 8 │ 0.97 │ │ 2 │ 6 │ 1.10 │ │ 2 │ 12 │ 2.51 │ └──────────┴─────────────┴───────────────┘
Next up are some basic analytics and aggregations in Ibis - let’s get total revenue with sum(), longest trip with max(), and average trip duration with mean(). Ibis is able to chain expressions similar to pandas.
#some basic analytics - let's get total revenue, longest trip.
= nycjanduration.agg(
insights
["total_trips"),
ibis._.count().name("total_amount"].sum().name("total_revenue"),
ibis._["trip_distance"].sum().name("total_distance_all"),
ibis._["rideminutes"].max().name("longest"),
ibis._["rideminutes"].mean().round(2).name("average_ride")
ibis._[
]
) insights
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓ ┃ total_trips ┃ total_revenue ┃ total_distance_all ┃ longest ┃ average_ride ┃ ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩ │ int64 │ float64 │ float64 │ int64 │ float64 │ ├─────────────┼───────────────┼────────────────────┼─────────┼──────────────┤ │ 2943859 │ 7.953883e+07 │ 1.015018e+07 │ 10030 │ 15.69 │ └─────────────┴───────────────┴────────────────────┴─────────┴──────────────┘
Wait, the longest trip seems a bit… lengthy… Note: we added a .round function to display the average ride more nicely. Let’s check out the ride itself.
filter(nycjanduration["rideminutes"] == 10030) nycjanduration.
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ vendorid ┃ tpep_pickup_datetime ┃ tpep_dropoff_datetime ┃ passenger_count ┃ trip_distance ┃ ratecodeid ┃ store_and_fwd_flag ┃ pulocationid ┃ dolocationid ┃ payment_type ┃ fare_amount ┃ extra ┃ mta_tax ┃ tip_amount ┃ tolls_amount ┃ improvement_surcharge ┃ total_amount ┃ congestion_surcharge ┃ airport_fee ┃ rideminutes ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ │ int64 │ timestamp(3) │ timestamp(3) │ float64 │ float64 │ float64 │ string │ int64 │ int64 │ int64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ int64 │ ├──────────┼──────────────────────┼───────────────────────┼─────────────────┼───────────────┼────────────┼────────────────────┼──────────────┼──────────────┼──────────────┼─────────────┼─────────┼─────────┼────────────┼──────────────┼───────────────────────┼──────────────┼──────────────────────┼─────────────┼─────────────┤ │ 2 │ 2023-01-23 11:21:51 │ 2023-01-30 10:31:02 │ 1.0 │ 0.0 │ 1.0 │ N │ 207 │ 95 │ 2 │ 3.0 │ 0.0 │ 0.5 │ 0.0 │ 0.0 │ 1.0 │ 4.5 │ 0.0 │ 0.0 │ 10030 │ └──────────┴──────────────────────┴───────────────────────┴─────────────────┴───────────────┴────────────┴────────────────────┴──────────────┴──────────────┴──────────────┴─────────────┴─────────┴─────────┴────────────┴──────────────┴───────────────────────┴──────────────┴──────────────────────┴─────────────┴─────────────┘
7 day trip? looks like the trip distance is zero, we can decide to remove the row from future calculations of average Let’s remove the outliers and join with a lookup table to get more information about the “where” of our analytical datasets - zones.
= (
nycjanduration_new filter(nycjanduration.trip_distance != 0.0)
nycjanduration.
)
nycjanduration_new
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ vendorid ┃ tpep_pickup_datetime ┃ tpep_dropoff_datetime ┃ passenger_count ┃ trip_distance ┃ ratecodeid ┃ store_and_fwd_flag ┃ pulocationid ┃ dolocationid ┃ payment_type ┃ fare_amount ┃ extra ┃ mta_tax ┃ tip_amount ┃ tolls_amount ┃ improvement_surcharge ┃ total_amount ┃ congestion_surcharge ┃ airport_fee ┃ rideminutes ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ │ int64 │ timestamp(3) │ timestamp(3) │ float64 │ float64 │ float64 │ string │ int64 │ int64 │ int64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ int64 │ ├──────────┼──────────────────────┼───────────────────────┼─────────────────┼───────────────┼────────────┼────────────────────┼──────────────┼──────────────┼──────────────┼─────────────┼─────────┼─────────┼────────────┼──────────────┼───────────────────────┼──────────────┼──────────────────────┼─────────────┼─────────────┤ │ 2 │ 2023-01-01 00:32:10 │ 2023-01-01 00:40:36 │ 1.0 │ 0.97 │ 1.0 │ N │ 161 │ 141 │ 2 │ 9.3 │ 1.0 │ 0.5 │ 0.00 │ 0.0 │ 1.0 │ 14.30 │ 2.5 │ 0.0 │ 8 │ │ 2 │ 2023-01-01 00:55:08 │ 2023-01-01 01:01:27 │ 1.0 │ 1.10 │ 1.0 │ N │ 43 │ 237 │ 1 │ 7.9 │ 1.0 │ 0.5 │ 4.00 │ 0.0 │ 1.0 │ 16.90 │ 2.5 │ 0.0 │ 6 │ │ 2 │ 2023-01-01 00:25:04 │ 2023-01-01 00:37:49 │ 1.0 │ 2.51 │ 1.0 │ N │ 48 │ 238 │ 1 │ 14.9 │ 1.0 │ 0.5 │ 15.00 │ 0.0 │ 1.0 │ 34.90 │ 2.5 │ 0.0 │ 12 │ │ 2 │ 2023-01-01 00:10:29 │ 2023-01-01 00:21:19 │ 1.0 │ 1.43 │ 1.0 │ N │ 107 │ 79 │ 1 │ 11.4 │ 1.0 │ 0.5 │ 3.28 │ 0.0 │ 1.0 │ 19.68 │ 2.5 │ 0.0 │ 11 │ │ 2 │ 2023-01-01 00:50:34 │ 2023-01-01 01:02:52 │ 1.0 │ 1.84 │ 1.0 │ N │ 161 │ 137 │ 1 │ 12.8 │ 1.0 │ 0.5 │ 10.00 │ 0.0 │ 1.0 │ 27.80 │ 2.5 │ 0.0 │ 12 │ │ 2 │ 2023-01-01 00:09:22 │ 2023-01-01 00:19:49 │ 1.0 │ 1.66 │ 1.0 │ N │ 239 │ 143 │ 1 │ 12.1 │ 1.0 │ 0.5 │ 3.42 │ 0.0 │ 1.0 │ 20.52 │ 2.5 │ 0.0 │ 10 │ │ 2 │ 2023-01-01 00:27:12 │ 2023-01-01 00:49:56 │ 1.0 │ 11.70 │ 1.0 │ N │ 142 │ 200 │ 1 │ 45.7 │ 1.0 │ 0.5 │ 10.74 │ 3.0 │ 1.0 │ 64.44 │ 2.5 │ 0.0 │ 22 │ │ 2 │ 2023-01-01 00:21:44 │ 2023-01-01 00:36:40 │ 1.0 │ 2.95 │ 1.0 │ N │ 164 │ 236 │ 1 │ 17.7 │ 1.0 │ 0.5 │ 5.68 │ 0.0 │ 1.0 │ 28.38 │ 2.5 │ 0.0 │ 15 │ │ 2 │ 2023-01-01 00:39:42 │ 2023-01-01 00:50:36 │ 1.0 │ 3.01 │ 1.0 │ N │ 141 │ 107 │ 2 │ 14.9 │ 1.0 │ 0.5 │ 0.00 │ 0.0 │ 1.0 │ 19.90 │ 2.5 │ 0.0 │ 11 │ │ 2 │ 2023-01-01 00:53:01 │ 2023-01-01 01:01:45 │ 1.0 │ 1.80 │ 1.0 │ N │ 234 │ 68 │ 1 │ 11.4 │ 1.0 │ 0.5 │ 3.28 │ 0.0 │ 1.0 │ 19.68 │ 2.5 │ 0.0 │ 8 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └──────────┴──────────────────────┴───────────────────────┴─────────────────┴───────────────┴────────────┴────────────────────┴──────────────┴──────────────┴──────────────┴─────────────┴─────────┴─────────┴────────────┴──────────────┴───────────────────────┴──────────────┴──────────────────────┴─────────────┴─────────────┘
Let’s create a cleaner set similar to before.
= nycjanduration_new.agg(
insights_new
["total_trips"),
ibis._.count().name("total_amount"].sum().name("total_revenue"),
ibis._["trip_distance"].sum().name("total_distance_all"),
ibis._["rideminutes"].max().name("longest"),
ibis._["rideminutes"].mean().round(2).name("average_ride")
ibis._[
]
) insights_new
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓ ┃ total_trips ┃ total_revenue ┃ total_distance_all ┃ longest ┃ average_ride ┃ ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩ │ int64 │ float64 │ float64 │ int64 │ float64 │ ├─────────────┼───────────────┼────────────────────┼─────────┼──────────────┤ │ 2906670 │ 7.838373e+07 │ 1.015018e+07 │ 6179 │ 15.76 │ └─────────────┴───────────────┴────────────────────┴─────────┴──────────────┘
You can already see a slightly more massaged dataset - the longest trip is lower, alongside average_ride has changed and the total number of trips has gone down by almost 40k
Next up we want to do something more powerful - join with related datasets to get more insights on geographical behaviour of taxi trips around NYC. Let’s look over the zonelookup table again.
zonelookup
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ ┃ locationid ┃ borough ┃ zone ┃ service_zone ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ │ string │ string │ string │ string │ ├────────────┼───────────────┼─────────────────────────┼──────────────┤ │ 1 │ EWR │ Newark Airport │ EWR │ │ 2 │ Queens │ Jamaica Bay │ Boro Zone │ │ 3 │ Bronx │ Allerton/Pelham Gardens │ Boro Zone │ │ 4 │ Manhattan │ Alphabet City │ Yellow Zone │ │ 5 │ Staten Island │ Arden Heights │ Boro Zone │ │ 6 │ Staten Island │ Arrochar/Fort Wadsworth │ Boro Zone │ │ 7 │ Queens │ Astoria │ Boro Zone │ │ 8 │ Queens │ Astoria Park │ Boro Zone │ │ 9 │ Queens │ Auburndale │ Boro Zone │ │ 10 │ Queens │ Baisley Park │ Boro Zone │ │ … │ … │ … │ … │ └────────────┴───────────────┴─────────────────────────┴──────────────┘
We can see pulocationid is int64, so we must cast to have the tables fully joined. Ibis supports casting data types within its library as well. In the line below, we use .cast(“str”) to ensure the two tables can be joined together. You can try without the cast and see what happens :).
= nycjanduration_new.inner_join(zonelookup, nycjanduration_new.pulocationid.cast("str") == zonelookup.locationid) joineddata
joineddata
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ ┃ vendorid ┃ tpep_pickup_datetime ┃ tpep_dropoff_datetime ┃ passenger_count ┃ trip_distance ┃ ratecodeid ┃ store_and_fwd_flag ┃ pulocationid ┃ dolocationid ┃ payment_type ┃ fare_amount ┃ extra ┃ mta_tax ┃ tip_amount ┃ tolls_amount ┃ improvement_surcharge ┃ total_amount ┃ congestion_surcharge ┃ airport_fee ┃ rideminutes ┃ locationid ┃ borough ┃ zone ┃ service_zone ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ │ int64 │ timestamp(3) │ timestamp(3) │ float64 │ float64 │ float64 │ string │ int64 │ int64 │ int64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ int64 │ string │ string │ string │ string │ ├──────────┼──────────────────────┼───────────────────────┼─────────────────┼───────────────┼────────────┼────────────────────┼──────────────┼──────────────┼──────────────┼─────────────┼─────────┼─────────┼────────────┼──────────────┼───────────────────────┼──────────────┼──────────────────────┼─────────────┼─────────────┼────────────┼───────────┼───────────────────────┼──────────────┤ │ 2 │ 2023-01-01 00:32:10 │ 2023-01-01 00:40:36 │ 1.0 │ 0.97 │ 1.0 │ N │ 161 │ 141 │ 2 │ 9.3 │ 1.0 │ 0.5 │ 0.00 │ 0.0 │ 1.0 │ 14.30 │ 2.5 │ 0.0 │ 8 │ 161 │ Manhattan │ Midtown Center │ Yellow Zone │ │ 2 │ 2023-01-01 00:55:08 │ 2023-01-01 01:01:27 │ 1.0 │ 1.10 │ 1.0 │ N │ 43 │ 237 │ 1 │ 7.9 │ 1.0 │ 0.5 │ 4.00 │ 0.0 │ 1.0 │ 16.90 │ 2.5 │ 0.0 │ 6 │ 43 │ Manhattan │ Central Park │ Yellow Zone │ │ 2 │ 2023-01-01 00:25:04 │ 2023-01-01 00:37:49 │ 1.0 │ 2.51 │ 1.0 │ N │ 48 │ 238 │ 1 │ 14.9 │ 1.0 │ 0.5 │ 15.00 │ 0.0 │ 1.0 │ 34.90 │ 2.5 │ 0.0 │ 12 │ 48 │ Manhattan │ Clinton East │ Yellow Zone │ │ 2 │ 2023-01-01 00:10:29 │ 2023-01-01 00:21:19 │ 1.0 │ 1.43 │ 1.0 │ N │ 107 │ 79 │ 1 │ 11.4 │ 1.0 │ 0.5 │ 3.28 │ 0.0 │ 1.0 │ 19.68 │ 2.5 │ 0.0 │ 11 │ 107 │ Manhattan │ Gramercy │ Yellow Zone │ │ 2 │ 2023-01-01 00:50:34 │ 2023-01-01 01:02:52 │ 1.0 │ 1.84 │ 1.0 │ N │ 161 │ 137 │ 1 │ 12.8 │ 1.0 │ 0.5 │ 10.00 │ 0.0 │ 1.0 │ 27.80 │ 2.5 │ 0.0 │ 12 │ 161 │ Manhattan │ Midtown Center │ Yellow Zone │ │ 2 │ 2023-01-01 00:09:22 │ 2023-01-01 00:19:49 │ 1.0 │ 1.66 │ 1.0 │ N │ 239 │ 143 │ 1 │ 12.1 │ 1.0 │ 0.5 │ 3.42 │ 0.0 │ 1.0 │ 20.52 │ 2.5 │ 0.0 │ 10 │ 239 │ Manhattan │ Upper West Side South │ Yellow Zone │ │ 2 │ 2023-01-01 00:27:12 │ 2023-01-01 00:49:56 │ 1.0 │ 11.70 │ 1.0 │ N │ 142 │ 200 │ 1 │ 45.7 │ 1.0 │ 0.5 │ 10.74 │ 3.0 │ 1.0 │ 64.44 │ 2.5 │ 0.0 │ 22 │ 142 │ Manhattan │ Lincoln Square East │ Yellow Zone │ │ 2 │ 2023-01-01 00:21:44 │ 2023-01-01 00:36:40 │ 1.0 │ 2.95 │ 1.0 │ N │ 164 │ 236 │ 1 │ 17.7 │ 1.0 │ 0.5 │ 5.68 │ 0.0 │ 1.0 │ 28.38 │ 2.5 │ 0.0 │ 15 │ 164 │ Manhattan │ Midtown South │ Yellow Zone │ │ 2 │ 2023-01-01 00:39:42 │ 2023-01-01 00:50:36 │ 1.0 │ 3.01 │ 1.0 │ N │ 141 │ 107 │ 2 │ 14.9 │ 1.0 │ 0.5 │ 0.00 │ 0.0 │ 1.0 │ 19.90 │ 2.5 │ 0.0 │ 11 │ 141 │ Manhattan │ Lenox Hill West │ Yellow Zone │ │ 2 │ 2023-01-01 00:53:01 │ 2023-01-01 01:01:45 │ 1.0 │ 1.80 │ 1.0 │ N │ 234 │ 68 │ 1 │ 11.4 │ 1.0 │ 0.5 │ 3.28 │ 0.0 │ 1.0 │ 19.68 │ 2.5 │ 0.0 │ 8 │ 234 │ Manhattan │ Union Sq │ Yellow Zone │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └──────────┴──────────────────────┴───────────────────────┴─────────────────┴───────────────┴────────────┴────────────────────┴──────────────┴──────────────┴──────────────┴─────────────┴─────────┴─────────┴────────────┴──────────────┴───────────────────────┴──────────────┴──────────────────────┴─────────────┴─────────────┴────────────┴───────────┴───────────────────────┴──────────────┘
Now we can do more cool things in ibis with group bys and aggregate by with zones and boroughs!
= (
groupbyboroughtrips
joineddata"zone")
.group_by(
.aggregate(=joineddata.vendorid.count(),
trips=joineddata.fare_amount.sum(),
totalrev=joineddata.passenger_count.sum(),
totalpassengers=joineddata.rideminutes.mean().round(2)
averageride
)"totalrev"))
.order_by(ibis.desc(10)
.limit(
)
groupbyboroughtrips
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ zone ┃ trips ┃ totalrev ┃ totalpassengers ┃ averageride ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ │ string │ int64 │ float64 │ float64 │ float64 │ ├──────────────────────────────┼────────┼──────────────┼─────────────────┼─────────────┤ │ JFK Airport │ 154492 │ 9.148091e+06 │ 223590.0 │ 37.12 │ │ LaGuardia Airport │ 87352 │ 3.577509e+06 │ 118834.0 │ 26.77 │ │ Midtown Center │ 130031 │ 1.956489e+06 │ 179987.0 │ 14.58 │ │ Upper East Side South │ 142043 │ 1.732204e+06 │ 191430.0 │ 11.78 │ │ Upper East Side North │ 131581 │ 1.708854e+06 │ 179243.0 │ 12.53 │ │ Times Sq/Theatre District │ 95098 │ 1.623739e+06 │ 140826.0 │ 15.71 │ │ Penn Station/Madison Sq West │ 105566 │ 1.619161e+06 │ 142369.0 │ 15.60 │ │ Midtown East │ 101456 │ 1.490681e+06 │ 136471.0 │ 14.05 │ │ Lincoln Square East │ 95530 │ 1.282554e+06 │ 133103.0 │ 12.64 │ │ Murray Hill │ 84567 │ 1.237674e+06 │ 114587.0 │ 13.50 │ └──────────────────────────────┴────────┴──────────────┴─────────────────┴─────────────┘
If you want to see what sql ibis generates, you can use the ibis.to_sql() method.
ibis.to_sql(groupbyboroughtrips)
WITH t0 AS (
SELECT
AS vendorid,
t3.vendorid AS tpep_pickup_datetime,
t3.tpep_pickup_datetime AS tpep_dropoff_datetime,
t3.tpep_dropoff_datetime AS passenger_count,
t3.passenger_count AS trip_distance,
t3.trip_distance AS ratecodeid,
t3.ratecodeid AS store_and_fwd_flag,
t3.store_and_fwd_flag AS pulocationid,
t3.pulocationid AS dolocationid,
t3.dolocationid AS payment_type,
t3.payment_type AS fare_amount,
t3.fare_amount AS extra,
t3.extra AS mta_tax,
t3.mta_tax AS tip_amount,
t3.tip_amount AS tolls_amount,
t3.tolls_amount AS improvement_surcharge,
t3.improvement_surcharge AS total_amount,
t3.total_amount AS congestion_surcharge,
t3.congestion_surcharge AS airport_fee
t3.airport_fee FROM dh_nyctaxi_video.taxizonenyc AS t3
WHERE
<> 0 OR FALSE
t3.passenger_count AS (
), t1 SELECT
AS vendorid,
t0.vendorid AS tpep_pickup_datetime,
t0.tpep_pickup_datetime AS tpep_dropoff_datetime,
t0.tpep_dropoff_datetime AS passenger_count,
t0.passenger_count AS trip_distance,
t0.trip_distance AS ratecodeid,
t0.ratecodeid AS store_and_fwd_flag,
t0.store_and_fwd_flag AS pulocationid,
t0.pulocationid AS dolocationid,
t0.dolocationid AS payment_type,
t0.payment_type AS fare_amount,
t0.fare_amount AS extra,
t0.extra AS mta_tax,
t0.mta_tax AS tip_amount,
t0.tip_amount AS tolls_amount,
t0.tolls_amount AS improvement_surcharge,
t0.improvement_surcharge AS total_amount,
t0.total_amount AS congestion_surcharge,
t0.congestion_surcharge AS airport_fee,
t0.airport_fee
DATE_DIFF('minute',
'minute', t0.tpep_pickup_datetime),
DATE_TRUNC('minute', t0.tpep_dropoff_datetime)
DATE_TRUNC(AS rideminutes
) FROM t0
WHERE
<> 0.0
t0.trip_distance
)SELECT
zone,
t2.
t2.trips,
t2.totalrev,
t2.totalpassengers,
t2.averagerideFROM (
SELECT
zone AS zone,
t3.COUNT(t1.vendorid) AS trips,
SUM(t1.fare_amount) AS totalrev,
SUM(t1.passenger_count) AS totalpassengers,
ROUND(AVG(t1.rideminutes), 2) AS averageride
FROM t1
JOIN dh_nyctaxi_video.zonelookup AS t3
ON CAST(t1.pulocationid AS VARCHAR) = t3.locationid
GROUP BY
1
AS t2
) ORDER BY
DESC
t2.totalrev LIMIT 10
Airport rides give the most revenue to taxi companies, that makes a lot of sense.
Let’s write our result tables back to trino (to show some write functionality, of course).
"groupbyboroughtrips_new", groupbyboroughtrips) con.create_table(
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ zone ┃ trips ┃ totalrev ┃ totalpassengers ┃ averageride ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ │ string │ int64 │ float64 │ float64 │ float64 │ ├──────────────────────────────┼────────┼──────────────┼─────────────────┼─────────────┤ │ JFK Airport │ 154492 │ 9.148091e+06 │ 223590.0 │ 37.12 │ │ LaGuardia Airport │ 87352 │ 3.577509e+06 │ 118834.0 │ 26.77 │ │ Midtown Center │ 130031 │ 1.956489e+06 │ 179987.0 │ 14.58 │ │ Upper East Side South │ 142043 │ 1.732204e+06 │ 191430.0 │ 11.78 │ │ Upper East Side North │ 131581 │ 1.708854e+06 │ 179243.0 │ 12.53 │ │ Times Sq/Theatre District │ 95098 │ 1.623739e+06 │ 140826.0 │ 15.71 │ │ Penn Station/Madison Sq West │ 105566 │ 1.619161e+06 │ 142369.0 │ 15.60 │ │ Midtown East │ 101456 │ 1.490681e+06 │ 136471.0 │ 14.05 │ │ Lincoln Square East │ 95530 │ 1.282554e+06 │ 133103.0 │ 12.64 │ │ Murray Hill │ 84567 │ 1.237674e+06 │ 114587.0 │ 13.50 │ └──────────────────────────────┴────────┴──────────────┴─────────────────┴─────────────┘
There you have it, a quick tutorial with Ibis, and Starburst Galaxy!