Chapter 8: Customizing Functions

Contents

Chapter 8: Customizing Functions

Apply

`apply` is useful when you want to perform more complex calculations on your data than simply adding or subtracting two columns.

Using it can save you from writing a for loop to iterate over a whole column, for instance.

import pandas as pd

def my_function():
    """Placeholder that shows the basic layout of a function definition."""
    # The body is indented 4 spaces; real function code would go here.
    return None

def my_sq(x):
    """Return ``x`` raised to the power of two."""
    return pow(x, 2)

my_sq(2)

my_sq(4)

def avg_2(x, y):
    """Return the mean of the two numbers ``x`` and ``y``."""
    total = x + y
    return total / 2.0

avg_2(10, 20)

15.0

# Small demo frame: two integer columns with three rows each.
df1 = pd.DataFrame({"A": [5, 10, 15], "B": [3, 6, 9]})

df1

	A	B
0	5	3
1	10	6
2	15	9

df1['A'].apply(my_sq)

   25
  100
  225
Name: A, dtype: int64

df1['A'] ** 2

   25
  100
  225
Name: A, dtype: int64

Now we want to understand how `apply` works on a DataFrame.

def print_me(x):
    """Print ``x`` to standard output and return nothing."""
    print(x)
    return None
df1

	A	B
0	5	3
1	10	6
2	15	9

df1.apply(print_me)

   5
  10
  15
Name: A, dtype: int64
  3
  6
  9
Name: B, dtype: int64

A    None
B    None
dtype: object

def avg_3(x, y, z):
    """Return the mean of the three numbers ``x``, ``y`` and ``z``."""
    total = x + y + z
    return total / 3

df1

	A	B
0	5	3
1	10	6
2	15	9

df1.apply(avg_3)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-16-c7586a24e299> in <module>
----> 1 df1.apply(avg_3)

~/.local/lib/python3.9/site-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwargs)
   9421             kwargs=kwargs,
   9422         )
-> 9423         return op.apply().__finalize__(self, method="apply")
   9424 
   9425     def applymap(

~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply(self)
    676             return self.apply_raw()
    677 
--> 678         return self.apply_standard()
    679 
    680     def agg(self):

~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply_standard(self)
    796 
    797     def apply_standard(self):
--> 798         results, res_index = self.apply_series_generator()
    799 
    800         # wrap results

~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply_series_generator(self)
    812             for i, v in enumerate(series_gen):
    813                 # ignore SettingWithCopy here in case the user mutates
--> 814                 results[i] = self.f(v)
    815                 if isinstance(results[i], ABCSeries):
    816                     # If we have a view on v, we need to make a copy because

TypeError: avg_3() missing 2 required positional arguments: 'y' and 'z'

def Avg_3(col):
    """Return the average of the first three values of ``col``.

    Written to be passed to ``DataFrame.apply``, which hands the function one
    whole column (a Series) at a time rather than individual values.

    ``col`` may be a pandas Series or any plain indexable sequence (list,
    tuple, ...) holding at least three numeric values.
    """
    # On a Series, ``col[0]`` is a *label* lookup for integer indexes and a
    # deprecated positional fallback otherwise. ``.iloc`` always means
    # "by position"; plain sequences have no ``.iloc`` and are indexed directly.
    values = col.iloc if hasattr(col, "iloc") else col
    x = values[0]
    y = values[1]
    z = values[2]
    return (x + y + z) / 3

df1.apply(Avg_3)

df1

df1.apply(Avg_3, axis=0)

def avg_2(row):
    """Return the average of the first two values of ``row``.

    Written to be passed to ``DataFrame.apply(..., axis=1)``, which hands the
    function one whole row (a Series labelled by column name) at a time.

    ``row`` may be a pandas Series or any plain indexable sequence (list,
    tuple, ...) holding at least two numeric values.
    """
    # A row Series is labelled by column name, so ``row[0]`` would rely on the
    # deprecated "integer key treated as position" fallback. ``.iloc`` is
    # explicitly positional; plain sequences have no ``.iloc`` and are indexed
    # directly.
    values = row.iloc if hasattr(row, "iloc") else row
    x = values[0]
    y = values[1]
    return (x + y) / 2
df1.apply(avg_2, axis=1)

import seaborn as sns

titanic = sns.load_dataset('titanic')

titanic.head()

titanic.info()

import numpy as np

1. Write a function that returns the count of missing values column-wise

def count_missing(vec):
    """Return how many entries of ``vec`` are missing (null/NaN)."""
    # pd.isnull yields a boolean mask; summing it counts the True entries.
    return np.sum(pd.isnull(vec))

titanic.apply(count_missing)

titanic.apply(count_missing, axis=1).value_counts() #549 rows have 1 missing values 182 have 0 missing value

2. Write a function that returns the proportion of missing values

# A proportion is just the count of missing values divided by the size of the whole vector

def prop_missing(vec):
    """Return the fraction of entries in ``vec`` that are missing."""
    # Proportion = number of missing entries / total number of entries.
    return count_missing(vec) / vec.size

titanic.apply(prop_missing)

3. Write a function that computes the remaining proportion (the share of values that are not missing)

def prop_complete(vec):
    """Return the fraction of entries in ``vec`` that are not missing."""
    missing_share = prop_missing(vec)
    return 1 - missing_share

# Apply prop_complete (not prop_missing) to get the non-missing share per column.
titanic.apply(prop_complete)

#titanic.loc[pd.isnull(titanic['embark_town']), :]