Chapter 8: Customizing Functions

Apply

`apply` is useful when you want to perform more complex calculations on your data than simply adding or subtracting two columns.

It can save you from writing a for loop to iterate over a whole column, for instance.

import pandas as pd
def my_function():
    """Skeleton of a function definition.

    The body of a function is indented four spaces under the ``def`` line.
    This placeholder does no work and returns ``None``.
    """
    return None
def my_sq(x):
    """Return ``x`` raised to the power of two."""
    squared = pow(x, 2)
    return squared
my_sq(2)
4
my_sq(4)
16
def avg_2(x, y):
    """Return the arithmetic mean of ``x`` and ``y``.

    The sum is divided by ``2.0``, so integer inputs give a float result.
    """
    total = x + y
    return total / 2.0
avg_2(10, 20)
15.0
# Small demo frame: two integer columns ("A" and "B") with three rows each.
df1 = pd.DataFrame({"A": [5, 10, 15], "B": [3, 6, 9]})
df1
A B
0 5 3
1 10 6
2 15 9
df1['A'].apply(my_sq)
0     25
1    100
2    225
Name: A, dtype: int64
df1['A'] ** 2
0     25
1    100
2    225
Name: A, dtype: int64

Now we want to understand how `apply` works when it is called on a whole DataFrame.

def print_me(x):
    """Print ``x`` to standard output and return ``None``.

    Handy as a probe: passing it to ``apply`` shows exactly what object
    ``apply`` hands to the function on each call.
    """
    print(x)
    return None
df1
A B
0 5 3
1 10 6
2 15 9
df1.apply(print_me)
0     5
1    10
2    15
Name: A, dtype: int64
0    3
1    6
2    9
Name: B, dtype: int64
A    None
B    None
dtype: object
def avg_3(x, y, z):
    """Return the arithmetic mean of the three values ``x``, ``y`` and ``z``."""
    total = x + y + z
    return total / 3
df1
A B
0 5 3
1 10 6
2 15 9
df1.apply(avg_3)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-16-c7586a24e299> in <module>
----> 1 df1.apply(avg_3)

~/.local/lib/python3.9/site-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwargs)
   9421             kwargs=kwargs,
   9422         )
-> 9423         return op.apply().__finalize__(self, method="apply")
   9424 
   9425     def applymap(

~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply(self)
    676             return self.apply_raw()
    677 
--> 678         return self.apply_standard()
    679 
    680     def agg(self):

~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply_standard(self)
    796 
    797     def apply_standard(self):
--> 798         results, res_index = self.apply_series_generator()
    799 
    800         # wrap results

~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply_series_generator(self)
    812             for i, v in enumerate(series_gen):
    813                 # ignore SettingWithCopy here in case the user mutates
--> 814                 results[i] = self.f(v)
    815                 if isinstance(results[i], ABCSeries):
    816                     # If we have a view on v, we need to make a copy because

TypeError: avg_3() missing 2 required positional arguments: 'y' and 'z'
def Avg_3(col):
    """Average the three entries of ``col`` found under the keys 0, 1 and 2.

    Written to be passed to ``DataFrame.apply``, which calls it with a
    single argument rather than three separate numbers.
    """
    first, second, third = col[0], col[1], col[2]
    return (first + second + third) / 3
# Called without `axis`, apply hands Avg_3 one column of df1 at a time.
df1.apply(Avg_3)
df1
# Same call with the axis spelled out; pandas documents axis=0 as the default,
# so this is presumably identical to the first call (confirm against the docs).
df1.apply(Avg_3, axis=0)
def avg_2(row):
    """Return the mean of the first two entries of ``row``, taken by position.

    Meant for ``df.apply(avg_2, axis=1)``, where each ``row`` is a Series
    labelled by column name (e.g. "A", "B"). Plain sequences such as lists
    and tuples are accepted as well.

    Note: this definition rebinds the name ``avg_2``, replacing the earlier
    two-argument ``avg_2(x, y)`` from this point on.

    Parameters
    ----------
    row : pandas.Series or sequence
        Container with at least two numeric entries.

    Returns
    -------
    The sum of the first two entries divided by 2.
    """
    # On a Series with non-integer labels, row[0] relies on a positional
    # fallback that pandas has deprecated; .iloc is always positional.
    positional = row.iloc if hasattr(row, "iloc") else row
    first = positional[0]
    second = positional[1]
    return (first + second) / 2
df1.apply(avg_2, axis=1)
import seaborn as sns

titanic = sns.load_dataset('titanic')
titanic.head()
titanic.info()
import numpy as np
  1. Write a function that returns the count of missing values, column-wise.

def count_missing(vec):
    """Return the number of missing entries in ``vec``.

    A boolean mask flags each missing entry; summing the mask counts the
    ``True`` values.
    """
    missing_mask = pd.isna(vec)
    return np.sum(missing_mask)
# Column-wise: one missing-value count per column.
titanic.apply(count_missing)
# Row-wise: count the missing values in each row, then tally how many rows share each count.
titanic.apply(count_missing, axis=1).value_counts() # 549 rows have 1 missing value; 182 rows have 0 missing values
2. Write a function that returns the proportion of missing values.
# A proportion is just the count computed above divided by the total number of elements
def prop_missing(vec):
    """Return the fraction of entries in ``vec`` that are missing.

    Computed as ``count_missing(vec)`` divided by ``vec.size``.
    """
    return count_missing(vec) / vec.size
titanic.apply(prop_missing)
 3. Write a function that computes the remaining proportion (the share of non-missing values).
def prop_complete(vec):
    """Return the fraction of entries in ``vec`` that are not missing.

    This is the complement of ``prop_missing(vec)``.
    """
    missing_share = prop_missing(vec)
    return 1 - missing_share
# Column-wise share of non-missing values, using the function defined just above.
titanic.apply(prop_complete)
#titanic.loc[pd.isnull(titanic['embark_town']), :]