Skip to content

Model Documentation

churn_rate_by_risk_level(rfm_table)

Calculate the percentage of customers that falls into each churn risk level (reported in the 'ChurnRate' column).

Parameters:

Name Type Description Default
rfm_table DataFrame

Data with churn risk levels.

required

Returns:

Name Type Description
DataFrame DataFrame

Churn rate percentages by churn risk level.

Source code in CRR/model/model.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def churn_rate_by_risk_level(rfm_table: pd.DataFrame) -> pd.DataFrame:
    """
    Report what percentage of all customers falls into each churn risk level.

    Parameters:
        rfm_table (DataFrame): Customer rows with 'CustomerID' and 'ChurnRiskLevel' columns.

    Returns:
        DataFrame: Indexed by 'ChurnRiskLevel', with a single 'ChurnRate' column holding
            each level's share of the non-null customer IDs, as a percentage rounded
            to two decimals.
    """
    # Non-null customer IDs per risk level, and across the whole table.
    customers_per_level = rfm_table.groupby('ChurnRiskLevel')['CustomerID'].count()
    customer_total = rfm_table['CustomerID'].count()

    share_pct = ((customers_per_level / customer_total) * 100).round(2)
    return share_pct.to_frame(name='ChurnRate')

classify_churn_risk(rfm_table)

Classify customers into churn risk levels based on their RFM scores.

Parameters:

Name Type Description Default
rfm_table DataFrame

Data with RFM scores.

required

Returns:

Name Type Description
DataFrame DataFrame

Data updated with a 'ChurnRiskLevel' column.

Source code in CRR/model/model.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def classify_churn_risk(rfm_table: pd.DataFrame) -> pd.DataFrame:
    """
    Classify customers into churn risk levels based on their RFM scores.

    Rules, evaluated in order (first match wins):
        * 'High Risk'   - R_Score, F_Score and M_Score are all <= 2.
        * 'Low Risk'    - R_Score >= 3.
        * 'Medium Risk' - everything else (including rows with a missing score).

    Parameters:
        rfm_table (DataFrame): Data with 'R_Score', 'F_Score' and 'M_Score' columns.
            The scores may be plain numbers or categoricals.

    Returns:
        DataFrame: The same ``rfm_table`` object, modified in place with a
            'ChurnRiskLevel' column.
    """

    def _numeric(column: str) -> pd.Series:
        # Scores produced by pd.qcut are ordered categoricals, and pandas compares
        # those by category *position*, not by label value. With reversed labels
        # (4, 3, 2, 1) the expression `score <= 2` would therefore match 4, 3 and 2.
        # Converting to plain numbers makes the thresholds mean what they say.
        return pd.to_numeric(rfm_table[column].astype(object), errors='coerce')

    recency = _numeric('R_Score')
    frequency = _numeric('F_Score')
    monetary = _numeric('M_Score')

    conditions = [
        (recency <= 2) & (frequency <= 2) & (monetary <= 2),
        (recency >= 3),
    ]
    # Labels for risk levels, aligned with `conditions`
    values = ['High Risk', 'Low Risk']
    default_value = 'Medium Risk'

    # Create a new column for churn risk level
    rfm_table['ChurnRiskLevel'] = np.select(conditions, values, default=default_value)
    return rfm_table

get_clusters(df)

Apply K-Means clustering to customer data based on RFM metrics and save the results to data/Customer_RFM_Clusters.csv.

Parameters:

Name Type Description Default
df DataFrame

Customer data with RFM metrics.

required

Returns:

Name Type Description
DataFrame DataFrame

Updated customer data including cluster assignments.

Source code in CRR/model/model.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def get_clusters(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply K-Means clustering to customer data based on RFM metrics and save the results.

    The Recency/Frequency/Monetary columns are mean-imputed and standardized, a
    per-cluster summary is printed for k = 3..6 for inspection, and the final
    assignment uses k = 5.

    Side effects:
        * Adds (or overwrites) a 'Cluster' column on the ``df`` passed in.
        * Writes data/Customer_RFM_Clusters.csv, relative to the current working
          directory, creating the ``data`` directory if it does not exist.

    Parameters:
        df (DataFrame): Customer data with 'CustomerID', 'Recency', 'Frequency',
            'Monetary', 'R_Score', 'F_Score', 'M_Score' and 'RFM_Score' columns.

    Returns:
        DataFrame: Those columns plus 'Cluster', one row per input row.
    """
    # Function-scope import so this block does not depend on the module's import list.
    from pathlib import Path

    imputer = SimpleImputer(strategy='mean')
    rfm = df[['Recency', 'Frequency', 'Monetary']]
    rfm_imputed = imputer.fit_transform(rfm)

    # Standardize the data
    scaler = StandardScaler()
    rfm_scaled = scaler.fit_transform(rfm_imputed)

    # Exploratory pass: print a summary for several k values. The labels are put
    # on a temporary copy so the caller's frame is only touched by the final fit.
    for k in range(3, 7):
        kmeans = KMeans(n_clusters=k, random_state=1)
        labels = kmeans.fit_predict(rfm_scaled)
        cluster_summary = df.assign(Cluster=labels).groupby('Cluster').agg({
            'Recency': 'mean',
            'Frequency': 'mean',
            'Monetary': ['mean', 'count']
        }).round(2)
        print(f"Cluster Summary for {k} Clusters:")
        print(cluster_summary)
        print("\n")

    k_selected = 5
    kmeans = KMeans(n_clusters=k_selected, random_state=1)
    df['Cluster'] = kmeans.fit_predict(rfm_scaled)
    df = df[['CustomerID','Recency','Frequency','Monetary','R_Score','F_Score','M_Score','RFM_Score','Cluster']]

    # to_csv does not create missing directories; make sure the target folder exists.
    output_path = Path("data/Customer_RFM_Clusters.csv")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    return df

get_rfm(customers_df, products_df, orders_df)

Calculate Recency, Frequency, and Monetary values for each customer and assign RFM scores.

Parameters:

Name Type Description Default
customers_df DataFrame

Contains customer data with at least 'CustomerID'.

required
products_df DataFrame

Contains product data with at least 'ProductID'.

required
orders_df DataFrame

Contains order data with 'OrderDate', 'CustomerID', 'ProductID', 'Price', and 'Quantity'.

required

Returns:

Name Type Description
DataFrame DataFrame

A DataFrame with each customer's RFM scores and metrics, merged with the customer information.

Source code in CRR/model/model.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def _quartile_scores(values: pd.Series, labels) -> pd.Series:
    """Bin ``values`` into four quantile groups labelled with ``labels``.

    Falls back to ranking the values (ties broken by row order) when the plain
    quantile edges are not unique, which happens when a metric is heavily tied.
    """
    try:
        return pd.qcut(values, 4, labels=labels)
    except ValueError:
        # Duplicate bin edges: rank first so that four distinct bins always exist.
        return pd.qcut(values.rank(method='first'), 4, labels=labels)


def get_rfm(customers_df: pd.DataFrame, products_df: pd.DataFrame, orders_df: pd.DataFrame) -> pd.DataFrame:

    """
    Calculate Recency, Frequency, and Monetary values for each customer and assign RFM scores.

    Recency is the number of whole days between now and the customer's latest order,
    Frequency is the customer's order-row count, and Monetary is the sum of
    Price * Quantity. Each metric is scored 1-4 by quartile; for Recency the
    smallest values (most recent customers) get the highest score. 'RFM_Score' is
    the three scores concatenated as a string, e.g. '431'.

    The input frames are not modified.

    Parameters:
        customers_df (DataFrame): Contains customer data with at least 'CustomerID'.
        products_df (DataFrame): Contains product data with at least 'ProductID'.
        orders_df (DataFrame): Contains order data with 'OrderID', 'OrderDate', 'CustomerID',
            'ProductID', 'Price', and 'Quantity'. Rows whose 'OrderDate' cannot be parsed
            are ignored.

    Returns:
        DataFrame: A DataFrame with each customer's RFM scores and metrics, merged with the
            customer information. Customers without any usable order are not included.
    """

    # Work on a copy so the caller's orders frame keeps its original rows and dtypes.
    orders = orders_df.copy()

    # Ensure correct data types
    orders['OrderDate'] = pd.to_datetime(orders['OrderDate'], errors='coerce')
    orders = orders.dropna(subset=['OrderDate'])  # Handle missing dates

    # Join orders with customers and products (inner joins drop unmatched orders)
    orders = orders.merge(customers_df, on="CustomerID", how='inner')
    orders = orders.merge(products_df, on="ProductID", how='inner')

    # Calculate total price for each order
    orders['TotalPrice'] = orders['Price'] * orders['Quantity']

    # Calculate Recency, Frequency, and Monetary values
    current_date = datetime.now()
    rfm_table = orders.groupby('CustomerID').agg({
        'OrderDate': lambda x: (current_date - x.max()).days,
        'OrderID': 'count',
        'TotalPrice': 'sum'
    }).rename(columns={'OrderDate': 'Recency', 'OrderID': 'Frequency', 'TotalPrice': 'Monetary'})

    # Calculate RFM Score (labels are reversed for Recency: fewer days -> higher score)
    rfm_table['R_Score'] = _quartile_scores(rfm_table['Recency'], range(4, 0, -1))
    rfm_table['F_Score'] = _quartile_scores(rfm_table['Frequency'], range(1, 5))
    rfm_table['M_Score'] = _quartile_scores(rfm_table['Monetary'], range(1, 5))
    rfm_table['RFM_Score'] = rfm_table['R_Score'].astype(str) + rfm_table['F_Score'].astype(str) + rfm_table['M_Score'].astype(str)

    # Merge RFM score back to the customer DataFrame
    customer_rfm = customers_df.merge(rfm_table, on='CustomerID', how='inner')
    return customer_rfm