A data frame is an abstraction over tabular data. Much like a matrix, a data frame has rows and columns; unlike a matrix, each column holds a specific data type and can be referred to by name. In the last section you saw some examples of data frames from the pandas library, which we'll continue to use here; pandas is by far the most prevalent Python data frame implementation, but it is not the only one (other examples include PySpark, Dask, and Polars).
Data frames are displayed like this:
import pandas as pd
planes = pd.read_csv('../data/planes.csv')
planes.head(n=3)
|  | tailnum | year | type | manufacturer | model | engines | seats | speed | engine |
---|---|---|---|---|---|---|---|---|---|
0 | N10156 | 2004.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NaN | Turbo-fan |
1 | N102UW | 1998.0 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NaN | Turbo-fan |
2 | N103US | 1999.0 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NaN | Turbo-fan |
We've used the head() method to print out only the first three rows. Each column contains the data referred to by its name: the "tailnum" column contains an identifier for each row, the "year" column contains the year for each row, and so on.
Each column has a specific type. We can see these types by using the dtypes property:
planes.dtypes
tailnum          object
year            float64
type             object
manufacturer     object
model            object
engines           int64
seats             int64
speed           float64
engine           object
dtype: object
It's not important right now to understand what each of the types means, though they're generally self-explanatory. In this example, int64 refers to 64-bit integers, float64 refers to 64-bit floating point numbers, and object refers to columns that are not stored as a numeric type (often strings).
In fact, data frames are arguably not the fundamental data structure in pandas. Pandas has another concept called the series, which in essence refers to a single one-dimensional vector of data. Data frames are ordered collections of series, and we can recover a series from a data frame by referring to the column by name:
planes["engines"].head(n=3)
0    2
1    2
2    2
Name: engines, dtype: int64
Note: equivalently, planes.engines will work, but that syntax is not preferred.
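One reason bracket notation is preferred: attribute access only works when the column name is a valid Python identifier, and it silently returns the wrong thing when the name collides with an existing DataFrame attribute. A minimal sketch (the "size" column here is hypothetical):

# Bracket notation works for any column name:
planes["engines"]

# Attribute access fails for names that aren't valid identifiers
# (e.g. names containing spaces), and a hypothetical column named
# "size" would be shadowed by the built-in DataFrame.size property:
# df.size       # the total number of elements, not the "size" column
# df["size"]    # the actual column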
You're unlikely to use series objects very much for their own sake, but as we will see they are used in subsetting data frames.
A concept used in pandas that is rare among data frame implementations is the index. Just as each column has a column name, each row can be referred to by its index. After reading a file from disk, there is usually a default numeric index that numbers the rows from 0 to N-1:
planes.index
RangeIndex(start=0, stop=3322, step=1)
This can become more complex, however. For example, your index could be string labels or dates/times occurring at a specific frequency. For the latter, pandas even has built-in functions for creating and manipulating such indices:
pd.date_range(
start="2022-01-01",
end="2022-01-02",
freq="h" # for hour
)
DatetimeIndex(['2022-01-01 00:00:00', '2022-01-01 01:00:00', '2022-01-01 02:00:00', '2022-01-01 03:00:00', '2022-01-01 04:00:00', '2022-01-01 05:00:00', '2022-01-01 06:00:00', '2022-01-01 07:00:00', '2022-01-01 08:00:00', '2022-01-01 09:00:00', '2022-01-01 10:00:00', '2022-01-01 11:00:00', '2022-01-01 12:00:00', '2022-01-01 13:00:00', '2022-01-01 14:00:00', '2022-01-01 15:00:00', '2022-01-01 16:00:00', '2022-01-01 17:00:00', '2022-01-01 18:00:00', '2022-01-01 19:00:00', '2022-01-01 20:00:00', '2022-01-01 21:00:00', '2022-01-01 22:00:00', '2022-01-01 23:00:00', '2022-01-02 00:00:00'], dtype='datetime64[ns]', freq='H')
In general, I find indices to be a confusing corner of pandas; other data frame libraries have profitably left the concept out. For the most part I try to ignore indices where I can, but you should be familiar with them, as they will come up at various points.
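If you'd rather not think about a custom index, the reset_index() method demotes it back to an ordinary column and restores the default numeric index. A minimal sketch:

# Make "tailnum" the index, then undo it
indexed = planes.set_index("tailnum")
indexed.reset_index().head(n=3)  # "tailnum" is a regular column again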
We don't always want all of the data in a data frame, so we need to take subsets. In general, subsetting means extracting a smaller portion of a DataFrame. Since the DataFrame is two-dimensional, there are two dimensions on which to subset.
Dimension 1: We may only want to consider certain variables. For example, we may only care about the year and engines variables. We call this selecting columns/variables -- this is similar to SQL's SELECT or R's dplyr package's select().
Dimension 2: We may only want to consider certain cases. For example, we may only care about the cases where the manufacturer is Embraer. We call this filtering or slicing -- this is similar to SQL's WHERE or R's dplyr package's filter() or slice(). And we can combine these two options to subset in both dimensions: the year and engines variables where the manufacturer is Embraer.
In the previous example, we want to do two things using planes:

- keep only the cases where the manufacturer is Embraer
- keep only the year and engines variables

But we also want to return a new DataFrame -- not just highlight certain cells. In other words, we want to turn this:
planes.head()
|  | tailnum | year | type | manufacturer | model | engines | seats | speed | engine |
---|---|---|---|---|---|---|---|---|---|
0 | N10156 | 2004.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 60 | NaN | Turbo-fan |
1 | N102UW | 1998.0 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NaN | Turbo-fan |
2 | N103US | 1999.0 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NaN | Turbo-fan |
3 | N104UW | 1999.0 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NaN | Turbo-fan |
4 | N10575 | 2002.0 | Fixed wing multi engine | EMBRAER | EMB-145LR | 2 | 55 | NaN | Turbo-fan |
Into this:
planes.head().loc[
planes['manufacturer'] == 'EMBRAER',
['year', 'engines']
]
|  | year | engines |
---|---|---|
0 | 2004.0 | 2 |
4 | 2002.0 | 2 |
Luckily, returning a new DataFrame is exactly what the subsetting methods detailed below do.
Recall that subsetting variables/columns is called selecting variables/columns. In a simple example, we can select a single variable as a Series object using bracket subsetting notation:
planes['year'].head()
0    2004.0
1    1998.0
2    1999.0
3    1999.0
4    2002.0
Name: year, dtype: float64
To get a data frame back instead of a series, we can pass a list instead of a single string column name:
planes[['year']].head()
|  | year |
---|---|
0 | 2004.0 |
1 | 1998.0 |
2 | 1999.0 |
3 | 1999.0 |
4 | 2002.0 |
Passing a list into the bracket subsetting notation allows us to select multiple variables at once:
planes[['year', 'engines']].head()
|  | year | engines |
---|---|---|
0 | 2004.0 | 2 |
1 | 1998.0 | 2 |
2 | 1999.0 | 2 |
3 | 1999.0 | 2 |
4 | 2002.0 | 2 |
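Equivalently, you can select columns with .loc by passing : in the row position, which returns the same result as the bracket selection above:

planes.loc[:, ['year', 'engines']].head()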
When we subset rows (aka cases, records, observations), two different terms come up: slicing and filtering. These are not the same:

- Slicing selects rows based on their position or Index value.
- Filtering selects rows that satisfy a logical condition.

We can slice cases/rows using the values in the Index and bracket subsetting notation. It's common practice to use .loc to slice cases/rows:
planes.loc[0:5]
|  | tailnum | year | type | manufacturer | model | engines | seats | speed | engine |
---|---|---|---|---|---|---|---|---|---|
0 | N10156 | 2004.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 60 | NaN | Turbo-fan |
1 | N102UW | 1998.0 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NaN | Turbo-fan |
2 | N103US | 1999.0 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NaN | Turbo-fan |
3 | N104UW | 1999.0 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NaN | Turbo-fan |
4 | N10575 | 2002.0 | Fixed wing multi engine | EMBRAER | EMB-145LR | 2 | 55 | NaN | Turbo-fan |
5 | N105UW | 1999.0 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NaN | Turbo-fan |
Note that unlike Python list slicing, .loc slicing is inclusive of both endpoints -- planes.loc[0:5] returned six rows. We can also pass a list of Index values:
planes.loc[[0, 2, 4, 6, 8]]
|  | tailnum | year | type | manufacturer | model | engines | seats | speed | engine |
---|---|---|---|---|---|---|---|---|---|
0 | N10156 | 2004.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 60 | NaN | Turbo-fan |
2 | N103US | 1999.0 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NaN | Turbo-fan |
4 | N10575 | 2002.0 | Fixed wing multi engine | EMBRAER | EMB-145LR | 2 | 55 | NaN | Turbo-fan |
6 | N107US | 1999.0 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NaN | Turbo-fan |
8 | N109UW | 1999.0 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NaN | Turbo-fan |
We can filter rows using a logical Series equal in length to the number of rows in the DataFrame.

Continuing our example, assume we want to determine whether each case's manufacturer is Embraer. We can use the manufacturer Series and an equality test to find the result for each row:
planes['manufacturer'] == 'EMBRAER'
0        True
1       False
2       False
3       False
4        True
        ...
3317    False
3318    False
3319    False
3320    False
3321    False
Name: manufacturer, Length: 3322, dtype: bool
We can use this resulting logical Series to filter cases -- rows that are True will be returned while those that are False will be removed:
planes[planes['manufacturer'] == 'EMBRAER'].head()
|  | tailnum | year | type | manufacturer | model | engines | seats | speed | engine |
---|---|---|---|---|---|---|---|---|---|
0 | N10156 | 2004.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 60 | NaN | Turbo-fan |
4 | N10575 | 2002.0 | Fixed wing multi engine | EMBRAER | EMB-145LR | 2 | 55 | NaN | Turbo-fan |
10 | N11106 | 2002.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NaN | Turbo-fan |
11 | N11107 | 2002.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NaN | Turbo-fan |
12 | N11109 | 2002.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NaN | Turbo-fan |
This also works with .loc; the benefit is that with .loc you can specify certain columns as well:
planes.loc[
planes['manufacturer'] == 'EMBRAER',
["tailnum", "year"]
].head()
|  | tailnum | year |
---|---|---|
0 | N10156 | 2004.0 |
4 | N10575 | 2002.0 |
10 | N11106 | 2002.0 |
11 | N11107 | 2002.0 |
12 | N11109 | 2002.0 |
Multiple conditional tests can be combined using logical operators -- a single | for "or" and a single & for "and" -- with each test wrapped in parentheses:
planes.loc[(planes['year'] > 2002) & (planes['year'] < 2004)].head()
|  | tailnum | year | type | manufacturer | model | engines | seats | speed | engine |
---|---|---|---|---|---|---|---|---|---|
15 | N11121 | 2003.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NaN | Turbo-fan |
16 | N11127 | 2003.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NaN | Turbo-fan |
17 | N11137 | 2003.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NaN | Turbo-fan |
18 | N11140 | 2003.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NaN | Turbo-fan |
19 | N11150 | 2003.0 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NaN | Turbo-fan |
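The same pattern works with | for "or" and ~ for "not", and the .isin() method is a convenient shorthand for testing membership in a set of values:

# Planes made by either manufacturer
planes.loc[
    (planes['manufacturer'] == 'EMBRAER') |
    (planes['manufacturer'] == 'AIRBUS INDUSTRIE')
].head()

# Equivalent, using .isin()
planes.loc[planes['manufacturer'].isin(['EMBRAER', 'AIRBUS INDUSTRIE'])].head()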
Often, one of the first things we want to do with a new data set is clean up the column names. We can do this a few different ways; to illustrate, let's look at the Ames housing data:
ames = pd.read_csv('../data/ames_raw.csv')
ames.head()
|  | Order | PID | MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | ... | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 |
1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | ... | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 |
2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 |
3 | 4 | 526353030 | 20 | RL | 93.0 | 11160 | Pave | NaN | Reg | Lvl | ... | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 |
4 | 5 | 527105010 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 |
5 rows × 82 columns
Say we want to rename the "MS SubClass" and "MS Zoning" columns. We can do so with the .rename() method, passing a dictionary that maps old names to new names: df.rename(columns={'old_name1': 'new_name1', 'old_name2': 'new_name2'}).
ames.rename(
columns={
'MS SubClass': 'ms_subclass',
'MS Zoning': 'ms_zoning'
}
).head()
|  | Order | PID | ms_subclass | ms_zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | ... | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 |
1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | ... | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 |
2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 |
3 | 4 | 526353030 | 20 | RL | 93.0 | 11160 | Pave | NaN | Reg | Lvl | ... | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 |
4 | 5 | 527105010 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 |
5 rows × 82 columns
Note that the .rename() method returns a new data frame and does not modify the original data frame in place.
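To keep the renamed columns, assign the result back to a variable:

# The original ames is unchanged unless we reassign
ames_renamed = ames.rename(
    columns={'MS SubClass': 'ms_subclass', 'MS Zoning': 'ms_zoning'}
)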
You can also rename columns by modifying the .columns property in place:
ames.columns = [c.lower() for c in ames.columns]
ames.head()
|  | order | pid | ms subclass | ms zoning | lot frontage | lot area | street | alley | lot shape | land contour | ... | pool area | pool qc | fence | misc feature | misc val | mo sold | yr sold | sale type | sale condition | saleprice |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 |
1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | ... | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 |
2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 |
3 | 4 | 526353030 | 20 | RL | 93.0 | 11160 | Pave | NaN | Reg | Lvl | ... | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 |
4 | 5 | 527105010 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 |
5 rows × 82 columns
It's common to want to modify a column of a DataFrame, or sometimes even to create a new column. For example, let's look at the saleprice column in our data.
sale_price = ames['saleprice']
sale_price
0       215000
1       105000
2       172000
3       244000
4       189900
         ...
2925    142500
2926    131000
2927    132000
2928    170000
2929    188000
Name: saleprice, Length: 2930, dtype: int64
Say we wanted to represent sale price in the thousands of dollars. We can do this by dividing by 1,000.
sale_price_k = sale_price / 1000
sale_price_k
0       215.0
1       105.0
2       172.0
3       244.0
4       189.9
        ...
2925    142.5
2926    131.0
2927    132.0
2928    170.0
2929    188.0
Name: saleprice, Length: 2930, dtype: float64
From a mathematical perspective, what we're doing here is dividing a vector -- a series of values (aka a Series) -- by a scalar -- a single value. Other vector-scalar math is supported as well.
(ames['saleprice'] - 12).head(n=2)
0    214988
1    104988
Name: saleprice, dtype: int64
(ames['saleprice'] * 10).head(n=2)
0    2150000
1    1050000
Name: saleprice, dtype: int64
(ames['saleprice'] ** 2).head(n=2)
0    46225000000
1    11025000000
Name: saleprice, dtype: int64
So we've created a new Series, sale_price_k. Right now it's totally separate from our original ames data frame, but we can make it a column of ames using assignment together with the column reference syntax:
df['new_column_name'] = new_column_series
ames['sale_price_k'] = sale_price_k
ames.head()
|  | order | pid | ms subclass | ms zoning | lot frontage | lot area | street | alley | lot shape | land contour | ... | pool qc | fence | misc feature | misc val | mo sold | yr sold | sale type | sale condition | saleprice | sale_price_k |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | ... | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 | 215.0 |
1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | ... | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 | 105.0 |
2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | ... | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 | 172.0 |
3 | 4 | 526353030 | 20 | RL | 93.0 | 11160 | Pave | NaN | Reg | Lvl | ... | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 | 244.0 |
4 | 5 | 527105010 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | ... | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 | 189.9 |
5 rows × 83 columns
You can even overwrite existing columns this way:
ames['sale_price_k'] = sale_price_k / 10
We may want to drop columns as well. For this we can use the .drop() method:
ames = ames.drop(columns=['sale_price_k'])
ames.head()
|  | order | pid | ms subclass | ms zoning | lot frontage | lot area | street | alley | lot shape | land contour | ... | pool area | pool qc | fence | misc feature | misc val | mo sold | yr sold | sale type | sale condition | saleprice |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 |
1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | ... | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 |
2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 |
3 | 4 | 526353030 | 20 | RL | 93.0 | 11160 | Pave | NaN | Reg | Lvl | ... | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 |
4 | 5 | 527105010 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 |
5 rows × 82 columns
So far we've only seen vector-scalar math, but vector-vector math is supported as well: arithmetic operations are applied element-by-element along the two vectors. Let's create a column that contains the price per square foot.
price_per_sqft = ames['saleprice'] / ames['gr liv area']
price_per_sqft.head()
0    129.830918
1    117.187500
2    129.420617
3    115.639810
4    116.574586
dtype: float64
Creating a new column is as simple as accessing that column in the normal way and assigning new values to it:
ames['price_per_sqft'] = price_per_sqft
ames.head()
|  | order | pid | ms subclass | ms zoning | lot frontage | lot area | street | alley | lot shape | land contour | ... | pool qc | fence | misc feature | misc val | mo sold | yr sold | sale type | sale condition | saleprice | price_per_sqft |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | ... | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 | 129.830918 |
1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | ... | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 | 117.187500 |
2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | ... | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 | 129.420617 |
3 | 4 | 526353030 | 20 | RL | 93.0 | 11160 | Pave | NaN | Reg | Lvl | ... | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 | 115.639810 |
4 | 5 | 527105010 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | ... | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 | 116.574586 |
5 rows × 83 columns
For simplicity, we started with mathematical operations, but pandas supports string operations as well. We can use + to concatenate strings, with both vectors and scalars.
('Home in ' +
ames['neighborhood'] +
' neighborhood sold under ' +
ames['sale condition'] +
' condition'
).head(n=5)
0    Home in NAmes neighborhood sold under Normal c...
1    Home in NAmes neighborhood sold under Normal c...
2    Home in NAmes neighborhood sold under Normal c...
3    Home in NAmes neighborhood sold under Normal c...
4    Home in Gilbert neighborhood sold under Normal...
dtype: object
More complex string operations are possible using methods available through the .str accessor; see the pandas documentation for more details.
# number of characters in string
ames['neighborhood'].str.len().head(n=5)
0    5
1    5
2    5
3    5
4    7
Name: neighborhood, dtype: int64
ames['garage type'].str.lower().str.replace('tchd', 'tached').head(n=5)
0    attached
1    attached
2    attached
3    attached
4    attached
Name: garage type, dtype: object
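Because .str methods return Series, they compose with the filtering syntax we saw earlier. For example, .str.contains() builds a logical mask from a substring test (here na=False treats missing values as non-matches rather than propagating NaN):

ames[ames['garage type'].str.contains('Attchd', na=False)].head(n=5)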
Missing values are typically denoted with NaN. We can use df.isnull() to find missing values in a DataFrame. It returns a boolean for each element:
ames.isnull().head(n=5)
|  | order | pid | ms subclass | ms zoning | lot frontage | lot area | street | alley | lot shape | land contour | ... | pool qc | fence | misc feature | misc val | mo sold | yr sold | sale type | sale condition | saleprice | price_per_sqft |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | False | False | False | False | True | False | False | ... | True | True | True | False | False | False | False | False | False | False |
1 | False | False | False | False | False | False | False | True | False | False | ... | True | False | True | False | False | False | False | False | False | False |
2 | False | False | False | False | False | False | False | True | False | False | ... | True | True | False | False | False | False | False | False | False | False |
3 | False | False | False | False | False | False | False | True | False | False | ... | True | True | True | False | False | False | False | False | False | False |
4 | False | False | False | False | False | False | False | True | False | False | ... | True | False | True | False | False | False | False | False | False | False |
5 rows × 83 columns
We can use this to easily compute the total number of missing values in each column:
ames.isnull().sum()
order               0
pid                 0
ms subclass         0
ms zoning           0
lot frontage      490
                 ...
yr sold             0
sale type           0
sale condition      0
saleprice           0
price_per_sqft      0
Length: 83, dtype: int64
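Swapping .sum() for .mean() gives the proportion of missing values per column instead of the count, since True counts as 1 and False as 0:

ames.isnull().mean()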
We can use any() to identify which columns have missing values. We can use this information for various purposes, such as subsetting for just the columns that have missing values.
# identify if missing values exist in each column:
missing = ames.isnull().any()
# subset for just those columns that have missing values:
ames[missing[missing].index].head(n=5)
|  | lot frontage | alley | mas vnr type | mas vnr area | bsmt qual | bsmt cond | bsmt exposure | bsmtfin type 1 | bsmtfin sf 1 | bsmtfin type 2 | ... | garage type | garage yr blt | garage finish | garage cars | garage area | garage qual | garage cond | pool qc | fence | misc feature |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 141.0 | NaN | Stone | 112.0 | TA | Gd | Gd | BLQ | 639.0 | Unf | ... | Attchd | 1960.0 | Fin | 2.0 | 528.0 | TA | TA | NaN | NaN | NaN |
1 | 80.0 | NaN | None | 0.0 | TA | TA | No | Rec | 468.0 | LwQ | ... | Attchd | 1961.0 | Unf | 1.0 | 730.0 | TA | TA | NaN | MnPrv | NaN |
2 | 81.0 | NaN | BrkFace | 108.0 | TA | TA | No | ALQ | 923.0 | Unf | ... | Attchd | 1958.0 | Unf | 1.0 | 312.0 | TA | TA | NaN | NaN | Gar2 |
3 | 93.0 | NaN | None | 0.0 | TA | TA | No | ALQ | 1065.0 | Unf | ... | Attchd | 1968.0 | Fin | 2.0 | 522.0 | TA | TA | NaN | NaN | NaN |
4 | 74.0 | NaN | None | 0.0 | Gd | TA | No | GLQ | 791.0 | Unf | ... | Attchd | 1997.0 | Fin | 2.0 | 482.0 | TA | TA | NaN | MnPrv | NaN |
5 rows × 27 columns
When we have missing values, we usually either drop them or impute them. You can drop missing values with .dropna():
ames.dropna()
order | pid | ms subclass | ms zoning | lot frontage | lot area | street | alley | lot shape | land contour | ... | pool qc | fence | misc feature | misc val | mo sold | yr sold | sale type | sale condition | saleprice | price_per_sqft |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 rows × 83 columns
Unfortunately, in this case every row has an NA value. Another option would be to impute:
import numpy as np
# example DataFrame with missing values
df = pd.DataFrame([[np.nan, 2, np.nan, 0],
[3, 4, np.nan, 1],
[np.nan, np.nan, np.nan, 5],
[np.nan, 3, np.nan, 4]],
columns=list('ABCD'))
df
|  | A | B | C | D |
---|---|---|---|---|
0 | NaN | 2.0 | NaN | 0 |
1 | 3.0 | 4.0 | NaN | 1 |
2 | NaN | NaN | NaN | 5 |
3 | NaN | 3.0 | NaN | 4 |
df.fillna(0) # fill with 0
|  | A | B | C | D |
---|---|---|---|---|
0 | 0.0 | 2.0 | 0.0 | 0 |
1 | 3.0 | 4.0 | 0.0 | 1 |
2 | 0.0 | 0.0 | 0.0 | 5 |
3 | 0.0 | 3.0 | 0.0 | 4 |
df.fillna(df.mean()) # fill with the mean
|  | A | B | C | D |
---|---|---|---|---|
0 | 3.0 | 2.0 | NaN | 0 |
1 | 3.0 | 4.0 | NaN | 1 |
2 | 3.0 | 3.0 | NaN | 5 |
3 | 3.0 | 3.0 | NaN | 4 |
df.fillna(method='bfill') # backward (upwards) fill from non-nan values
|  | A | B | C | D |
---|---|---|---|---|
0 | 3.0 | 2.0 | NaN | 0 |
1 | 3.0 | 4.0 | NaN | 1 |
2 | NaN | 3.0 | NaN | 5 |
3 | NaN | 3.0 | NaN | 4 |
df.fillna(method='ffill') # forward (downward) fill from non-nan values
|  | A | B | C | D |
---|---|---|---|---|
0 | NaN | 2.0 | NaN | 0 |
1 | 3.0 | 4.0 | NaN | 1 |
2 | 3.0 | 4.0 | NaN | 5 |
3 | 3.0 | 3.0 | NaN | 4 |
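Note that in recent pandas releases fillna(method=...) is deprecated in favor of the dedicated .bfill() and .ffill() methods, which behave the same way:

df.bfill()  # same as df.fillna(method='bfill')
df.ffill()  # same as df.fillna(method='ffill')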
There will be times when you want to apply a function that is not built into pandas. For this, we have a few methods:

- df.apply() applies a function column-wise or row-wise across a DataFrame (the function must be able to accept/return an array)
- df.applymap() applies a function element-wise (for functions that accept/return single values at a time)
- series.apply()/series.map(), the same as above but for a pandas Series

For example, say you had the following custom function that determines whether a home is considered a luxury home, based simply on the sale price.
def is_luxury_home(x):
    if x > 500000:
        return 'Luxury'
    else:
        return 'Non-luxury'
ames['saleprice'].apply(is_luxury_home)
0       Non-luxury
1       Non-luxury
2       Non-luxury
3       Non-luxury
4       Non-luxury
           ...
2925    Non-luxury
2926    Non-luxury
2927    Non-luxury
2928    Non-luxury
2929    Non-luxury
Name: saleprice, Length: 2930, dtype: object
You can even use functions that require additional arguments; just specify the arguments in .apply():
def is_luxury_home(x, price):
    if x > price:
        return 'Luxury'
    else:
        return 'Non-luxury'
ames['saleprice'].apply(is_luxury_home, price=200000)
0           Luxury
1       Non-luxury
2       Non-luxury
3           Luxury
4       Non-luxury
           ...
2925    Non-luxury
2926    Non-luxury
2927    Non-luxury
2928    Non-luxury
2929    Non-luxury
Name: saleprice, Length: 2930, dtype: object
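For simple conditional logic like this, a vectorized alternative is usually much faster than .apply(), since it avoids calling a Python function once per row. A sketch using NumPy's where:

import numpy as np

# Vectorized equivalent of the .apply() call above; note this returns
# a NumPy array rather than a Series
np.where(ames['saleprice'] > 200000, 'Luxury', 'Non-luxury')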
Sometimes we have a function that we want to apply to every element across multiple columns. For example, say we wanted to convert several of the square footage variables to square meters. For this we can use the .applymap() method.
def convert_to_sq_meters(x):
    return x * 0.092903
ames[
['gr liv area', 'garage area', 'lot area']
].applymap(convert_to_sq_meters).head(n=5)
|  | gr liv area | garage area | lot area |
---|---|---|---|
0 | 153.847368 | 49.052784 | 2951.528310 |
1 | 83.241088 | 67.819190 | 1079.718666 |
2 | 123.468087 | 28.985736 | 1325.447101 |
3 | 196.025330 | 48.495366 | 1036.797480 |
4 | 151.338987 | 44.779246 | 1284.848490 |
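Since this particular function is just scalar arithmetic, the same result can be had with ordinary vectorized math, which is shorter and faster. (Also note that recent pandas releases have renamed .applymap() to DataFrame.map().)

# Element-wise multiplication over the whole selection, no applymap needed
ames[['gr liv area', 'garage area', 'lot area']] * 0.092903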
There are various aggregation functions that largely do what you'd expect:
ames[['saleprice', 'gr liv area']].mean()
saleprice      180796.060068
gr liv area      1499.690444
dtype: float64
ames[['saleprice', 'gr liv area']].median()
saleprice      160000.0
gr liv area      1442.0
dtype: float64
To easily get a description of various useful summary statistics for each column, you can use the .describe() method:
ames.describe()
|  | order | pid | ms subclass | lot frontage | lot area | overall qual | overall cond | year built | year remod/add | mas vnr area | ... | open porch sf | enclosed porch | 3ssn porch | screen porch | pool area | misc val | mo sold | yr sold | saleprice | price_per_sqft |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2930.00000 | 2.930000e+03 | 2930.000000 | 2440.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2907.000000 | ... | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 |
mean | 1465.50000 | 7.144645e+08 | 57.387372 | 69.224590 | 10147.921843 | 6.094881 | 5.563140 | 1971.356314 | 1984.266553 | 101.896801 | ... | 47.533447 | 23.011604 | 2.592491 | 16.002048 | 2.243345 | 50.635154 | 6.216041 | 2007.790444 | 180796.060068 | 121.303619 |
std | 845.96247 | 1.887308e+08 | 42.638025 | 23.365335 | 7880.017759 | 1.411026 | 1.111537 | 30.245361 | 20.860286 | 179.112611 | ... | 67.483400 | 64.139059 | 25.141331 | 56.087370 | 35.597181 | 566.344288 | 2.714492 | 1.316613 | 79886.692357 | 32.090488 |
min | 1.00000 | 5.263011e+08 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 12789.000000 | 15.371394 |
25% | 733.25000 | 5.284770e+08 | 20.000000 | 58.000000 | 7440.250000 | 5.000000 | 5.000000 | 1954.000000 | 1965.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2007.000000 | 129500.000000 | 99.871575 |
50% | 1465.50000 | 5.354536e+08 | 50.000000 | 68.000000 | 9436.500000 | 6.000000 | 5.000000 | 1973.000000 | 1993.000000 | 0.000000 | ... | 27.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 160000.000000 | 120.230070 |
75% | 2197.75000 | 9.071811e+08 | 70.000000 | 80.000000 | 11555.250000 | 7.000000 | 6.000000 | 2001.000000 | 2004.000000 | 164.000000 | ... | 70.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 213500.000000 | 139.885283 |
max | 2930.00000 | 1.007100e+09 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | ... | 742.000000 | 1012.000000 | 508.000000 | 576.000000 | 800.000000 | 17000.000000 | 12.000000 | 2010.000000 | 755000.000000 | 276.250881 |
8 rows × 40 columns
While summary methods can be convenient, there are a few drawbacks to using them on DataFrames: they apply the same statistic to every column, and they compute only one statistic at a time. To get around these problems, the DataFrame has a powerful method, .agg():
ames.agg({
'saleprice': ['mean']
})
|  | saleprice |
---|---|
mean | 180796.060068 |
We can extend this to multiple variables by adding elements to the dict:
ames.agg({
'saleprice': ['mean'],
'gr liv area': ['mean']
})
|  | saleprice | gr liv area |
---|---|---|
mean | 180796.060068 | 1499.690444 |
And because the values of the dict are lists, we can compute additional aggregations at the same time:
ames.agg({
'saleprice': ['mean', 'median'],
'gr liv area': ['mean', 'min']
})
|  | saleprice | gr liv area |
---|---|---|
mean | 180796.060068 | 1499.690444 |
median | 160000.000000 | NaN |
min | NaN | 334.000000 |
In the section above, we talked about summary operations in the context of collapsing a DataFrame to a single row. This is not always the case -- often we are interested in examining specific groups in our data and we want to perform summary operations for these groups. Thus, we are interested in collapsing to a single row per group. This is known as a grouped aggregation.
For example, suppose we have a categorical variable A and a numeric variable B, and we are interested in finding the sum of B for each category/value in A. This kind of grouped aggregation is useful whenever we want to summarize by category.
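Here is a minimal sketch of that idea, using a toy DataFrame with hypothetical columns A and B (the groupby() method used here is introduced just below):

toy = pd.DataFrame({
    'A': ['x', 'y', 'x', 'y', 'x'],
    'B': [1, 2, 3, 4, 5],
})

# One row per category in A, holding the sum of B for that category
toy.groupby('A')['B'].sum()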
When we summarize by groups, we can use the same aggregation methods we previously did:

- DataFrame.sum()
- DataFrame.describe()
- DataFrame.agg({'VariableName': ['sum', 'mean']})
The only difference is the need to set the DataFrame group prior to aggregating. We can set the group by calling the DataFrame.groupby() method and passing a variable name:
ames_grp = ames.groupby('neighborhood')
ames_grp
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x129388ee0>
If we then call an aggregation method after our groupby() call, we get back a DataFrame with group-level aggregations:
agg_ames = (
ames.groupby('neighborhood')
.agg({'saleprice': ['mean', 'median']})
.head()
)
agg_ames
neighborhood | saleprice mean | saleprice median |
---|---|---|
Blmngtn | 196661.678571 | 191500.0 |
Blueste | 143590.000000 | 130500.0 |
BrDale | 105608.333333 | 106000.0 |
BrkSide | 124756.250000 | 126750.0 |
ClearCr | 208662.090909 | 197500.0 |
This process always follows the same split-apply-combine model: the data is split into groups, the aggregation is applied within each group, and the results are combined into a single result.
Notice that the grouped variable becomes the Index in our example.
agg_ames.index
Index(['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr'], dtype='object', name='neighborhood')
You should be aware that you can no longer treat the result as if it were numbered by row:
agg_ames.loc[["Blmngtn", "ClearCr"]]
# But try agg_ames.iloc[[0, -1]]!
neighborhood | saleprice mean | saleprice median |
---|---|---|
Blmngtn | 196661.678571 | 191500.0 |
ClearCr | 208662.090909 | 197500.0 |
If you don't want this behavior, you can specify as_index=False:
(
ames.groupby('neighborhood', as_index=False)
.agg({'saleprice': ['mean', 'median']})
.head()
)
|  | neighborhood | saleprice mean | saleprice median |
---|---|---|---|
0 | Blmngtn | 196661.678571 | 191500.0 |
1 | Blueste | 143590.000000 | 130500.0 |
2 | BrDale | 105608.333333 | 106000.0 |
3 | BrkSide | 124756.250000 | 126750.0 |
4 | ClearCr | 208662.090909 | 197500.0 |
Sometimes we have multiple categories by which we'd like to group. To extend our example, assume we want to find the average sale price by neighborhood *AND* year sold. We can pass a list of variable names to the groupby() method:
agg_ames_multi = (
ames.groupby(['neighborhood', 'yr sold'])
.agg({'saleprice': 'mean'})
)
agg_ames_multi
| neighborhood | yr sold | saleprice |
|---|---|---|
| Blmngtn | 2006 | 214424.454545 |
|  | 2007 | 194671.500000 |
|  | 2008 | 190714.400000 |
|  | 2009 | 177266.666667 |
|  | 2010 | 176000.000000 |
| ... | ... | ... |
| Timber | 2010 | 224947.625000 |
| Veenker | 2006 | 270000.000000 |
|  | 2007 | 253577.777778 |
|  | 2008 | 225928.571429 |
|  | 2009 | 253962.500000 |
130 rows × 1 columns
Note: when you group by multiple columns, the result will have a MultiIndex. These are a pain, frankly, and I encourage you to use as_index=False in such cases.
agg_ames_multi.index
MultiIndex([('Blmngtn', 2006), ('Blmngtn', 2007), ('Blmngtn', 2008), ('Blmngtn', 2009), ('Blmngtn', 2010), ('Blueste', 2007), ('Blueste', 2008), ('Blueste', 2009), ('Blueste', 2010), ( 'BrDale', 2006), ... ('StoneBr', 2010), ( 'Timber', 2006), ( 'Timber', 2007), ( 'Timber', 2008), ( 'Timber', 2009), ( 'Timber', 2010), ('Veenker', 2006), ('Veenker', 2007), ('Veenker', 2008), ('Veenker', 2009)], names=['neighborhood', 'yr sold'], length=130)
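If you already have a MultiIndexed result, .reset_index() flattens it back into ordinary columns, much as as_index=False would have done up front:

# Demote 'neighborhood' and 'yr sold' from the index back to columns
agg_ames_multi.reset_index().head()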
Are there any questions before moving on?