It's quite common to have multiple data frames that need to be joined together. Consider the example of one "fact" table and multiple "dimension" tables, all stored separately:
import pandas as pd
fact = (
pd.DataFrame({
'dim_id': [1, 2, 3],
'val_x': ['x1', 'x2', 'x3']
})
)
dim = (
pd.DataFrame({
'dim_id': [1, 2, 4],
'val_y': ['y1', 'y2', 'y4']
})
)
fact
 | dim_id | val_x |
---|---|---|
0 | 1 | x1 |
1 | 2 | x2 |
2 | 3 | x3 |
dim
 | dim_id | val_y |
---|---|---|
0 | 1 | y1 |
1 | 2 | y2 |
2 | 4 | y4 |
An inner join matches pairs of observations whenever their keys are equal. We can do an inner join with the `merge()` method:
inner = (
fact.merge(
dim,
how="inner",
on="dim_id"
)
)
inner
 | dim_id | val_x | val_y |
---|---|---|---|
0 | 1 | x1 | y1 |
1 | 2 | x2 | y2 |
Notice that the non-matching `dim_id`s are not represented here.
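If you want to see which keys were dropped, one option is `merge`'s `indicator` parameter, which adds a `_merge` column flagging where each row came from. A minimal sketch using the frames above:

```python
# An outer merge with indicator=True labels each row as
# "both", "left_only", or "right_only" in a `_merge` column:
check = fact.merge(
    dim,
    how="outer",
    on="dim_id",
    indicator=True
)
# Keep only the rows that an inner join would have dropped:
check[check["_merge"] != "both"]
```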
An inner join keeps only the observations that appear in both tables. However, we often want to retain all observations from at least one of the tables; for that we use an outer join. There are three main types of outer joins:
With a left join we retain all observations from `fact`, and we add the columns from `dim`. Rows in `fact` where there is no matching key value in `dim` will have `NaN` values in the new columns. We can perform a left (outer) join by specifying `how="left"`:
left = (
fact.merge(
dim,
how="left",
on="dim_id"
)
)
left
 | dim_id | val_x | val_y |
---|---|---|---|
0 | 1 | x1 | y1 |
1 | 2 | x2 | y2 |
2 | 3 | x3 | NaN |
A right join is just a flipped left join: we retain all observations from `dim`, and we add the columns from `fact`. Similar to a left join, rows in `dim` where there is no matching key value in `fact` will have `NaN` values in the new columns:
right = (
fact.merge(
dim,
how="right",
on="dim_id"
)
)
right
 | dim_id | val_x | val_y |
---|---|---|---|
0 | 1 | x1 | y1 |
1 | 2 | x2 | y2 |
2 | 4 | NaN | y4 |
We can also perform a full outer join, where we keep all observations in both `fact` and `dim`. This join will match observations where the key variable(s) have matching information in both tables and then fill in non-matching values as `NaN`.
outer = (
fact.merge(
dim,
how="outer",
on="dim_id"
)
)
outer
 | dim_id | val_x | val_y |
---|---|---|---|
0 | 1 | x1 | y1 |
1 | 2 | x2 | y2 |
2 | 3 | x3 | NaN |
3 | 4 | NaN | y4 |
The final type of join that pandas offers is a cross join, chosen with the parameter `how="cross"`. This constructs the Cartesian product of the two data frames and should be used sparingly, as it can result in very large data frames. Note that you do not need to specify `on` here, as there are no join keys for a cross join (attempting to do so will result in an error).
Note further that when both data frames have equally-named columns, pandas will append "_x" to the first column and "_y" to the second in the result.
cross = (
fact.merge(
dim,
how="cross"
)
)
cross
 | dim_id_x | val_x | dim_id_y | val_y |
---|---|---|---|---|
0 | 1 | x1 | 1 | y1 |
1 | 1 | x1 | 2 | y2 |
2 | 1 | x1 | 4 | y4 |
3 | 2 | x2 | 1 | y1 |
4 | 2 | x2 | 2 | y2 |
5 | 2 | x2 | 4 | y4 |
6 | 3 | x3 | 1 | y1 |
7 | 3 | x3 | 2 | y2 |
8 | 3 | x3 | 4 | y4 |
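If you'd rather not have the default "_x"/"_y" suffixes, `merge` accepts a `suffixes` parameter. A quick sketch using the same cross join:

```python
# Control the suffixes applied to overlapping column names:
fact.merge(
    dim,
    how="cross",
    suffixes=("_fact", "_dim")
)
```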
In the above cases `fact` and `dim` have been joined on the same column name. If you wish to join on columns that are named differently, you can do so with `left_on` and `right_on`:
fact_renamed = (
fact.rename(
columns={"dim_id": "fact_id"}
)
)
merge_different_cols = (
fact_renamed.merge(
dim,
how="inner",
left_on="fact_id",
right_on="dim_id"
)
)
merge_different_cols
 | fact_id | val_x | dim_id | val_y |
---|---|---|---|---|
0 | 1 | x1 | 1 | y1 |
1 | 2 | x2 | 2 | y2 |
As you can see, this will retain all joining columns.
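If the duplicated key column is unwanted, one option is to drop it after the merge. A minimal sketch using the result above:

```python
# Drop the redundant key column from the merged result:
merge_different_cols.drop(columns="dim_id")
```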
If you wish to join on multiple columns at once, simply pass a list to the `on` parameter (or to `left_on` and `right_on`):
# `assign()` creates a new column and
# returns a new data frame with that column:
fact_new_col = fact.assign(
dim_id2=[1, 2, 3]
)
dim_new_col = dim.assign(
dim_id2=[1, 2, 3]
)
fact_new_col.merge(
dim_new_col,
how="inner",
on=["dim_id", "dim_id2"]
)
 | dim_id | val_x | dim_id2 | val_y |
---|---|---|---|---|
0 | 1 | x1 | 1 | y1 |
1 | 2 | x2 | 2 | y2 |
`join()` vs `merge()`
Finally, you might encounter a second way to join two data frames: with the (appropriately named) `join()` method. This differs from `merge()` in that it requires you to join on the index rather than on one or more named columns:
fact_index = fact.set_index("dim_id")
dim_index = dim.set_index("dim_id")
# Remember, these new data frames are now
# indexed by "dim_id" rather than row
# number:
fact_index
 | val_x |
---|---|
dim_id | |
1 | x1 |
2 | x2 |
3 | x3 |
inner_join = (
fact_index
.join(
# no "on" argument
dim_index,
how="inner"
)
)
inner_join
 | val_x | val_y |
---|---|---|
dim_id | | |
1 | x1 | y1 |
2 | x2 | y2 |
There is little practical difference between `join()` and `merge()`: they perform equivalently in most cases and offer the same kinds of join. It's mostly a question of whether you're joining on the index or on columns.
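Note that `merge()` can also join on the index via its `left_index` and `right_index` parameters, so the inner join above could equivalently be written as follows (a sketch reusing the indexed frames):

```python
# Equivalent to the join() call above, using merge() on the indexes:
fact_index.merge(
    dim_index,
    how="inner",
    left_index=True,
    right_index=True
)
```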
In pandas, a distinction that will probably trip everyone up at least once is that between views (looking at part of an existing object) and copies (making a new copy of an object in memory). This is a bit abstract, and even the pandas docs state the following:
...it’s very hard to predict whether [an operation] will return a view or a copy.
I recommend reading the linked section of the docs for more information.
It's relatively short and could save you some effort.
The main takeaway is that the most common warning you'll encounter in pandas is the `SettingWithCopyWarning`; pandas raises it to warn you that your code might not be doing what you think it is, or that the operation may behave unpredictably.
As an example, say the number of seats on this particular plane was recorded incorrectly. Instead of 55 seats it should actually be 60 seats.
planes = pd.read_csv("../data/planes.csv")
tailnum_of_interest = planes['tailnum'] == 'N10156'
planes[tailnum_of_interest]
 | tailnum | year | type | manufacturer | model | engines | seats | speed | engine |
---|---|---|---|---|---|---|---|---|---|
0 | N10156 | 2004.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NaN | Turbo-fan |
Let's say we naively try to set the number of seats by filtering and selecting this element in our data frame with the following bracket notation:
planes[tailnum_of_interest]['seats'] = 60
/var/folders/j3/v1318ng94fvdpq7kzr0hq9kw0000gn/T/ipykernel_6733/1344183120.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  planes[tailnum_of_interest]['seats'] = 60
As you can see, we get a `SettingWithCopyWarning`. Was the data frame changed?
planes[tailnum_of_interest]['seats']
0    55
Name: seats, dtype: int64
Long story short, subsetting with the bracket operator creates a copy, not a view:
planes[tailnum_of_interest]._is_view
False
So when you go to set the "seats" column, you're not operating on the original data.
Note: don't rely on `._is_view`; it is a private attribute and as such could be removed at any time. I only use it here for illustrative purposes.
The way to avoid this is to never chain accessors when setting data. Instead, use a single accessor with `.loc`:
planes.loc[
tailnum_of_interest,
"seats"
] = 60
planes[tailnum_of_interest]['seats']
0    60
Name: seats, dtype: int64
See the linked documentation for more details.
Optional note: if you check `planes.loc[tailnum_of_interest, "seats"]`, you'll see that it results in a copy as well, not a view. What gives? This comes down to the way assignment works in Python; pandas has overridden it such that, when used as the left-hand side of an assignment statement, this `.loc` expression results in a view instead of a copy. This can't work when you've chained multiple accessors.
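Conversely, if you actually want an independent subset to work with (rather than to modify the original), making the copy explicit with `.copy()` sidesteps the warning entirely. A minimal sketch:

```python
# An explicit copy: assignments here clearly do not touch `planes`.
subset = planes[tailnum_of_interest].copy()
subset["seats"] = 60
```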
Pandas is relatively well optimized, with performance-critical portions written in the compiled language Cython. But there's room for improvement on occasion.
Under the hood, a pandas data frame is backed by multidimensional arrays from the numerical library NumPy. Let's assume we have an expensive function we wish to apply to each row of a data frame. For illustration we'll use a simple sum of squares:
def sum_squares(x, y):
# Assume this is a much larger function,
# with complex, branching logic.
return x**2 + y**2
We can easily use this function with pandas:
%%time
planes.apply(
lambda row: sum_squares(
row["seats"], row["engines"]
),
axis=1
).head(5)
CPU times: user 16.3 ms, sys: 571 µs, total: 16.9 ms
Wall time: 16.9 ms
0     3604
1    33128
2    33128
3    33128
4     3029
dtype: int64
But we can eke out some performance by dipping down into NumPy:
import numpy as np
%%time
np_result = np.vectorize(sum_squares)(
planes["seats"], planes["engines"]
)
np_result[:5]
CPU times: user 1.72 ms, sys: 82 µs, total: 1.8 ms
Wall time: 1.8 ms
array([ 3604, 33128, 33128, 33128, 3029])
The `vectorize()` function takes a function and converts it into one that operates elementwise on NumPy arrays (or pandas Series, which are backed by NumPy arrays). In this case -- though this is a very silly example -- we get almost 10x the performance by using NumPy.
Although the result is a NumPy array rather than a pandas series, you can still assign it to a column as if it were a series:
planes["silly_col"] = np_result
Fastest of all is to simply apply arithmetic operators to the individual series:
%%time
(planes["seats"]**2 + planes["engines"]**2).head(5)
CPU times: user 346 µs, sys: 27 µs, total: 373 µs
Wall time: 367 µs
0     3604
1    33128
2    33128
3    33128
4     3029
dtype: int64
But this won't work for more complex logic.
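That said, for logic with only a couple of branches you can often stay vectorized with NumPy helpers such as `np.where()`. A minimal sketch (the 100-seat threshold here is purely illustrative):

```python
# Vectorized branching: label each plane using an arbitrary seat threshold.
labels = np.where(planes["seats"] > 100, "large", "small")
labels[:5]
```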
In general, there are a ton of useful functions in NumPy that work as you'd expect on pandas series and data frames. If you need to do numerical processing or linear algebra, I would encourage you to read a guide, such as this one.
Another way to speed up code is to use parallel processing. Pandas under the hood is single-threaded, but for some tasks we can split a series or data frame into parts and operate on multiple portions simultaneously.
Note: I want you to get a sense for how this is possible, but if you don't follow all of the Python code here that's fine. Just be aware of the capability so you can research it further.
First, we'll import from the `multiprocessing` module built into Python. This allows us to distribute a function over a sequence of elements in parallel:
from multiprocessing import Pool
We need a function to apply. As above, this would ordinarily be a more complex function, but we'll keep it simple:
def sum_squares_df(df):
df["silly_col"] = (
df["seats"]**2 +
df["engines"]**2
)
return df
However, due to a quirk of how Jupyter and the multiprocessing module interact, we need to import it from a file:
import sum_squares
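For reference, the imported file (assumed here to be called sum_squares.py and to sit next to the notebook) would contain essentially the function we just wrote:

```python
# sum_squares.py -- assumed module contents
def sum_squares_df(df):
    # Add a column with the sum of squared seats and engines.
    df["silly_col"] = (
        df["seats"]**2 +
        df["engines"]**2
    )
    return df
```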
This sets up a pool of workers for parallel processing:
cores = 4
pool = Pool(cores)
Now we need to split the data frame up into 4 portions, one for each core. For this we'll use the `array_split()` function from NumPy, which does exactly what it says, in this case returning a list of 4 data frames:
planes_split = np.array_split(planes, cores)
Then we distribute the function over each data frame chunk:
split_work = pool.map(
sum_squares.sum_squares_df,
planes_split
)
Finally, we can combine the results back into one data frame, using the `concat()` function:
df = pd.concat(split_work)
We'll also need to clean up after ourselves, because we've opened multiple processes:
pool.close()
pool.join()
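As an aside, `Pool` can also be used as a context manager, which handles cleanup for you (on exit it terminates the workers, which is fine here because `map()` blocks until all results are returned). A minimal sketch of the same computation:

```python
# Same computation, with the pool cleaned up automatically on exit:
with Pool(cores) as pool:
    split_work = pool.map(
        sum_squares.sum_squares_df,
        planes_split
    )
df = pd.concat(split_work)
```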
Although this multi-step process is cumbersome, we could convert it into a reusable function:
def parallel_apply(
df, func, cores
):
pool = Pool(cores)
splits = np.array_split(df, cores)
split_work = pool.map(
func, splits
)
df = pd.concat(split_work)
pool.close()
pool.join()
return df
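Usage would then look something like this (reusing the module-level function from above):

```python
# Apply the function across 4 chunks of the data frame in parallel:
planes_parallel = parallel_apply(
    planes,
    sum_squares.sum_squares_df,
    cores=4
)
```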
Something to note here is how much extra work we're doing just to apply a function in parallel: setting up a pool of workers, splitting the data, distributing the work, recombining the results, and cleaning up the pool.
In other words: you should only do this if the code you're parallelizing is sufficiently time-intensive to make it worthwhile. Otherwise you could easily end up taking more time than you would if you'd remained single-threaded.
Note: In general, if you have a substantial amount of data and wish to leverage multiple cores or CPUs, you might be better served investigating a library that is built from the ground up to allow for distributed processing. Dask and PySpark are two possible alternatives in this space.
The Python visualization ecosystem is extensive. You'll come across various libraries such as Matplotlib, Seaborn, Plotly and others. (See the PyViz website for more information on plotting capabilities in Python.)
However, since we are focusing on pandas here, we'll stick with the plotting methods built into pandas (which use Matplotlib as a backend).
Fair warning: as we don't have much time, this will be a pretty whirlwind tour.
For this we'll use a sample of the Complete Journey data provided externally by 84.51˚:
sales = pd.read_csv("../data/cj_sample.csv")
# We need to convert a string to a Timestamp
# with `to_datetime()`:
sales["transaction_timestamp"] = (
pd.to_datetime(
sales["transaction_timestamp"]
)
)
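With the column converted to a proper datetime, the `.dt` accessor becomes available. A quick, illustrative check:

```python
# Peek at the day names to confirm the conversion worked:
sales["transaction_timestamp"].dt.day_name().head()
```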
We can visualize the distribution of a `Series` with a histogram:
sales["sales_value"].plot.hist();
It's highly skewed. To get a more illustrative plot we can drop the zero values and put the counts on a log scale:
sales[
sales["sales_value"] > 0
]["sales_value"].plot.hist(
log=True, bins=30
);
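Note that `log=True` only puts the counts on a log scale; if you'd rather transform the values themselves, you could take the log with NumPy first. A sketch, assuming NumPy is imported as `np`:

```python
# Log-transform the positive sales values, then plot their histogram:
np.log(
    sales.loc[sales["sales_value"] > 0, "sales_value"]
).plot.hist(bins=30);
```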
We can also view a line plot over time, using the "transaction_timestamp" column. Pandas is quite good at dealing with dates and times, though in this case we need to set the time column to be the index:
sales_indexed = (
sales.set_index("transaction_timestamp")
["sales_value"]
)
(sales_indexed
.plot.line(figsize=(10,4)));
That's not a particularly engaging plot. Fortunately, since we have a datetime index we can use `resample()` to group by the day and take the sum:
(
sales_indexed
.resample('D')
.sum()
.plot.line(figsize=(10,4))
);
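Other frequencies work the same way; for example, weekly totals (a quick sketch):

```python
# Resample to weekly totals instead of daily:
(
    sales_indexed
    .resample('W')
    .sum()
    .plot.line(figsize=(10,4))
);
```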
We can even be reasonably fancy here. Let's say we wanted a plot of the distribution of total daily sales by day of the week:
day_order = [
'Monday', 'Tuesday', 'Wednesday',
'Thursday', 'Friday', 'Saturday',
'Sunday'
]
total_sales_by_weekday = (
sales_indexed
.resample('D')
.sum()
.rename(lambda idx: idx.day_name())
.groupby('transaction_timestamp')
.quantile([.25, .5, .75])
.unstack()
.reindex(day_order)
)
total_sales_by_weekday.plot.line(
title='Median and IQR of total sales by DOW',
figsize=(10,4)
);
Line by line, we:
- resample to daily frequency and take the sum, giving total sales per day
- use `rename()`, which, when passed a function of one argument, will apply that function to the index (here replacing each date with its day name)
- group by the renamed index and take the .25, .5 and .75 quantiles
- call `unstack()`; this takes the inner index (the quantile labels) and transposes them to be columns
- use `reindex()` to force the index into the customary weekday ordering

For reference, the final data frame we use to plot looks like this:
(
sales_indexed
.resample('D')
.sum()
.rename(lambda idx: idx.day_name())
.groupby('transaction_timestamp')
.quantile([.25, .5, .75])
.unstack()
.reindex(day_order)
)
 | 0.25 | 0.50 | 0.75 |
---|---|---|---|
transaction_timestamp | | | |
Monday | 634.9300 | 686.720 | 732.2500 |
Tuesday | 574.1600 | 650.735 | 732.3175 |
Wednesday | 568.6600 | 622.830 | 705.5575 |
Thursday | 562.6550 | 634.630 | 709.5600 |
Friday | 611.8275 | 677.125 | 739.8250 |
Saturday | 769.0600 | 867.095 | 954.2750 |
Sunday | 862.2900 | 948.430 | 990.9600 |
There are a number of different plots we can use here, including:
- `.plot.box()`
- `.plot.kde()`
- `.plot.bar()`
- `.plot.barh()`
- `.plot.pie()` (but please don't)

With the basics of plotting a `Series` in hand, this can easily be extended to plotting with multiple columns of a data frame. For example, let's say we wanted to plot sales vs quantity sold across all transactions:
sales.plot.scatter(
x='quantity',
y='sales_value',
title='Sales versus quantity',
figsize=(8,4)
);
We can use the same plotting methods with a data frame as we used for a series. For example, say we wanted to view the distribution of sales by department:
dept_sales = (
sales.groupby("department", as_index=False)
.agg({"sales_value": "sum"})
.sort_values("sales_value", ascending=False)
.head(5)
)
(dept_sales
.sort_values('sales_value')
.plot.barh(
x='department',
y='sales_value',
color='red'
));
We can even plot multiple lines, in just the same manner as we plotted a series. Say we wanted to plot sales over time for various discount types:
total_daily_discounts = (
sales[sales["department"] == "GROCERY"]
.set_index('transaction_timestamp')
.loc[:, ['retail_disc', 'coupon_disc', 'coupon_match_disc']]
.resample('D')
.sum()
)
total_daily_discounts.head()
 | retail_disc | coupon_disc | coupon_match_disc |
---|---|---|---|
transaction_timestamp | | | |
2017-01-01 | 50.70 | 0.0 | 0.0 |
2017-01-02 | 82.02 | 0.4 | 0.4 |
2017-01-03 | 68.34 | 1.0 | 0.0 |
2017-01-04 | 71.44 | 1.4 | 0.4 |
2017-01-05 | 56.66 | 0.0 | 0.0 |
If we apply `.plot.line()` to this data frame, then by default it will plot each of the numeric variables on the same plot:
total_daily_discounts.plot.line(
logy=True, figsize=(10, 4)
);
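If you only want some of the columns, you can pass them to the `y` parameter. A quick sketch:

```python
# Plot just the coupon-related discount columns:
total_daily_discounts.plot.line(
    y=["coupon_disc", "coupon_match_disc"],
    figsize=(10, 4)
);
```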
Are there any questions before moving on?