Chapter 8: Customizing Functions#
Apply#
`apply` is useful when you want to perform more complex calculations on your data than simply adding or subtracting two columns.
It can also save you from writing a for loop to iterate over a whole column, for instance.
import pandas as pd
def my_function():
    """Skeleton showing the layout of a function definition.

    The body is indented 4 spaces under the ``def`` line; real function
    code would go where the placeholder statement is.
    """
    # placeholder body: the function does nothing and returns None
    return None
def my_sq(x):
    """Return ``x`` raised to the power of two."""
    squared = pow(x, 2)
    return squared
my_sq(2)
4
my_sq(4)
16
def avg_2(x, y):
    """Return the average of the two numbers ``x`` and ``y``.

    The sum is divided by the float ``2.0``, so integer inputs give a
    float result.
    """
    total = x + y
    return total / 2.0
avg_2(10, 20)
15.0
# Small example frame with two integer columns, used by the cells that follow.
df1 = pd.DataFrame(
    {
        "A": [5, 10, 15],
        "B": [3, 6, 9],
    }
)
df1
| | A | B |
|---|---|---|
| 0 | 5 | 3 |
| 1 | 10 | 6 |
| 2 | 15 | 9 |
df1['A'].apply(my_sq)
0 25
1 100
2 225
Name: A, dtype: int64
df1['A'] ** 2
0 25
1 100
2 225
Name: A, dtype: int64
Next, let's look at what `apply` actually passes to the function when it is called on a whole DataFrame.
def print_me(x):
    """Print ``x`` and return nothing.

    Handy for inspecting what ``DataFrame.apply`` hands to the applied
    function on each call.
    """
    print(x)
df1
| | A | B |
|---|---|---|
| 0 | 5 | 3 |
| 1 | 10 | 6 |
| 2 | 15 | 9 |
df1.apply(print_me)
0 5
1 10
2 15
Name: A, dtype: int64
0 3
1 6
2 9
Name: B, dtype: int64
A None
B None
dtype: object
def avg_3(x, y, z):
    """Return the average of the three numbers ``x``, ``y`` and ``z``."""
    total = x + y + z
    return total / 3
df1
| | A | B |
|---|---|---|
| 0 | 5 | 3 |
| 1 | 10 | 6 |
| 2 | 15 | 9 |
df1.apply(avg_3)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-16-c7586a24e299> in <module>
----> 1 df1.apply(avg_3)
~/.local/lib/python3.9/site-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwargs)
9421 kwargs=kwargs,
9422 )
-> 9423 return op.apply().__finalize__(self, method="apply")
9424
9425 def applymap(
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply(self)
676 return self.apply_raw()
677
--> 678 return self.apply_standard()
679
680 def agg(self):
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply_standard(self)
796
797 def apply_standard(self):
--> 798 results, res_index = self.apply_series_generator()
799
800 # wrap results
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply_series_generator(self)
812 for i, v in enumerate(series_gen):
813 # ignore SettingWithCopy here in case the user mutates
--> 814 results[i] = self.f(v)
815 if isinstance(results[i], ABCSeries):
816 # If we have a view on v, we need to make a copy because
TypeError: avg_3() missing 2 required positional arguments: 'y' and 'z'
def Avg_3(col):
    """Return the average of the entries of ``col`` at keys 0, 1 and 2.

    Written to be passed to ``DataFrame.apply``, which calls it with a
    single argument rather than three separate numbers.
    """
    first, second, third = col[0], col[1], col[2]
    total = first + second + third
    return total / 3
df1.apply(Avg_3)
df1
df1.apply(Avg_3, axis=0)
def avg_2(row):
    """Return the average of the first two values of ``row``.

    Meant to be used as ``df.apply(avg_2, axis=1)``, where each call
    receives one row whose index holds the column labels.

    Note: this definition reuses the name of the earlier two-argument
    ``avg_2(x, y)`` and replaces it from this point on.

    Parameters
    ----------
    row : pandas.Series or sequence
        Container holding at least two numeric values.

    Returns
    -------
    The sum of the first two values divided by 2.
    """
    # Select by position explicitly. With string labels such as "A"/"B",
    # ``row[0]`` depends on the integer-as-position fallback of
    # ``Series.__getitem__``, which newer pandas deprecates and removes.
    # Plain sequences (lists, arrays) have no ``.iloc`` and are indexed directly.
    values = row.iloc if hasattr(row, "iloc") else row
    x = values[0]
    y = values[1]
    return (x + y) / 2
df1.apply(avg_2, axis=1)
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()
titanic.info()
import numpy as np
1. Write a function that returns the count of missing values, column-wise.
def count_missing(vec):
    """Return how many entries of ``vec`` are missing (null).

    ``pd.isnull`` flags each missing entry and ``np.sum`` adds up the
    flags, so the result is the number of missing entries.
    """
    return np.sum(pd.isnull(vec))
# Default axis: count_missing is called once per column.
titanic.apply(count_missing)
# axis=1: count_missing is called once per row; value_counts() then tallies
# how many rows share each missing-value count.
# Observed: 549 rows have 1 missing value and 182 rows have 0 missing values.
titanic.apply(count_missing, axis=1).value_counts()
2. Write a function that returns the proportion of missing values.
# A proportion is simply the count of interest divided by the total number of values.
def prop_missing(vec):
    """Return the fraction of entries of ``vec`` that are missing.

    Computed as ``count_missing(vec)`` divided by ``vec.size``.
    """
    return count_missing(vec) / vec.size
titanic.apply(prop_missing)
3. Write a function that computes the complementary proportion, i.e. the proportion of values that are not missing.
def prop_complete(vec):
    """Return the fraction of entries of ``vec`` that are not missing.

    This is the complement of ``prop_missing(vec)``.
    """
    missing_share = prop_missing(vec)
    return 1 - missing_share
# Apply the function defined just above: the share of non-missing values per
# column. (The cell previously re-applied prop_missing by mistake.)
titanic.apply(prop_complete)
#titanic.loc[pd.isnull(titanic['embark_town']), :]