18 Robocalls Consumer Protections

Simulates a dataset to educate on safe customer phone services, raise awareness of AI/robocall harassment by corporations, and help optimize compliant calling practices. Based on statistics from the adult dataset, with added legal/compliance features.

Features

- Customer demographics (age, education, marital, occupation, etc.)
- Economic indicators (CPI, CCI, irate, employment)
- Call details (robot/human calls, 800 number)
- Location/legal (countries, cities, states, GDPR)
- Purchase outcome with correlations

Usage

Run simulation.py to generate simulated_customer_robocall_dataset.csv, then load the file into a data analysis tool (e.g., R or Python) for modeling.

Data Generation

- Size: ~48k rows
- Correlations: Purchase is influenced by age, education, cap_gain, and robot_calling (negative)
- Missing values: Simulated in edu_years, housing, job, marital

18.1 GDPR Section

GDPR (General Data Protection Regulation) applies to personal data processing for EU residents or by EU-based entities. For robocalls, it requires explicit consent for automated marketing calls. Non-compliance can lead to fines up to 4% of global turnover. In this dataset, ‘GDPR_applies’ flags cases where GDPR rules must be followed, helping model compliant practices vs. potential harassment in non-regulated areas.

18.2 Deepfake Concerns

Modern robocalls may use deepfake AI voices that mimic humans so convincingly that recipients can’t tell whether a call is automated. This blurs the line between robot and human calls and enables manipulation and scams. The dataset’s ‘robot_calling’ flag simulates this; in reality, detection tools are needed. Awareness helps consumers report suspicious calls to authorities such as the FCC and FTC.

# Load needed libraries.
library(ggplot2)
library(dplyr)
library(tidyr)
library(caret)
library(reticulate)

# Chunk defaults for the rendered report: suppress warnings/messages and use
# a 10 x 6 figure size.
knitr::opts_chunk$set(warning = FALSE, message = FALSE, fig.width = 10, fig.height = 6)

# Set before any Python call so the flag is already in place when reticulate
# initialises Python.
# NOTE(review): presumably this disables reticulate's uv-managed environments
# so the interpreter pinned below is used -- confirm against reticulate docs.
Sys.setenv(RETICULATE_UV_ENABLED = "0")

# Pin the Python interpreter. The RETICULATE_PYTHON environment variable, when
# set, takes precedence over the original machine-specific path so the script
# is not tied to a single user's installation.
python_path <- Sys.getenv("RETICULATE_PYTHON", unset = "")
if (!nzchar(python_path)) {
  python_path <- "C:/Users/casti/AppData/Local/Programs/Python/Python313/python.exe"
}
use_python(python_path, required = TRUE)

# Troubleshooting: if this session is already bound to another interpreter,
# restart R (in RStudio: .rs.restartR()) and run this section again.

# Load the dataset through the Python `datasets` module, convert its "train"
# entry to pandas and then to a tibble, and print per-column summaries.
datasets <- import("datasets")
ds <- datasets$load_dataset("supersam7/robocalls_consumer_protections")
df <- as_tibble(ds["train"]$to_pandas())
summary(df)
##       age        education_num   marital_status      occupation       
##  Min.   :17.00   Min.   : 1.00   Length:48842       Length:48842      
##  1st Qu.:31.00   1st Qu.: 8.00   Class :character   Class :character  
##  Median :39.00   Median :10.00   Mode  :character   Mode  :character  
##  Mean   :39.82   Mean   : 9.51                                        
##  3rd Qu.:48.00   3rd Qu.:11.00                                        
##  Max.   :89.00   Max.   :15.00                                        
##     cap_gain     hours_per_week      score        value_flag       
##  Min.   :    0   Min.   : 1.00   Min.   :44.89   Length:48842      
##  1st Qu.:  310   1st Qu.:31.00   1st Qu.:57.55   Class :character  
##  Median :  744   Median :40.00   Median :60.23   Mode  :character  
##  Mean   : 1075   Mean   :39.93   Mean   :60.24                     
##  3rd Qu.: 1488   3rd Qu.:48.00   3rd Qu.:62.95                     
##  Max.   :11599   Max.   :98.00   Max.   :75.37                     
##  country_of_business country_of_customer state_of_customer  city_of_customer  
##  Length:48842        Length:48842        Length:48842       Length:48842      
##  Class :character    Class :character    Class :character   Class :character  
##  Mode  :character    Mode  :character    Mode  :character   Mode  :character  
##                                                                               
##                                                                               
##                                                                               
##   GDPR_applies    robot_calling   number_of_robot_calls number_of_human_calls
##  Min.   :0.0000   Min.   :0.000   Min.   : 0.0000       Min.   : 0.000       
##  1st Qu.:1.0000   1st Qu.:0.000   1st Qu.: 0.0000       1st Qu.: 2.000       
##  Median :1.0000   Median :0.000   Median : 0.0000       Median : 3.000       
##  Mean   :0.7518   Mean   :0.296   Mean   : 0.8822       Mean   : 2.993       
##  3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.: 2.0000       3rd Qu.: 4.000       
##  Max.   :1.0000   Max.   :1.000   Max.   :10.0000       Max.   :12.000       
##  800_number_or_not   housing              loan            phone_type       
##  Min.   :0.0000    Length:48842       Length:48842       Length:48842      
##  1st Qu.:0.0000    Class :character   Class :character   Class :character  
##  Median :1.0000    Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.7028                                                            
##  3rd Qu.:1.0000                                                            
##  Max.   :1.0000                                                            
##     month             weekday               CPI             CCI        
##  Length:48842       Length:48842       Min.   :92.20   Min.   :-50.80  
##  Class :character   Class :character   1st Qu.:93.18   1st Qu.:-43.65  
##  Mode  :character   Mode  :character   Median :93.57   Median :-40.39  
##                                        Mean   :93.56   Mean   :-40.32  
##                                        3rd Qu.:93.96   3rd Qu.:-37.11  
##                                        Max.   :94.77   Max.   :-26.91  
##      irate          employment      purchase     
##  Min.   :0.6481   Min.   :4969   Min.   :0.0000  
##  1st Qu.:2.8917   1st Qu.:5128   1st Qu.:0.0000  
##  Median :3.5226   Median :5159   Median :0.0000  
##  Mean   :3.4693   Mean   :5156   Mean   :0.3425  
##  3rd Qu.:4.1209   3rd Qu.:5188   3rd Qu.:1.0000  
##  Max.   :5.0450   Max.   :5227   Max.   :1.0000
# Fixed and improved plots for robot calls


# Plot 1: number_of_robot_calls (x) against number_of_human_calls (y), one
# point per row of df. Low alpha lets stacked points show up as darker spots.
ggplot(
  data = df,
  mapping = aes(x = number_of_robot_calls, y = number_of_human_calls)
) +
  geom_point(colour = "#007bff", alpha = 0.1) +
  ggtitle("Robot Calls vs Human Calls") +
  xlab("Number of Robot Calls") +
  ylab("Number of Human Calls") +
  theme_minimal(base_size = 16) +
  theme(text = element_text(colour = "#0056b3"))

# Plot 2: histogram of number_of_robot_calls with one-unit-wide bins.
ggplot(df, aes(x = number_of_robot_calls)) +
  geom_histogram(
    binwidth = 1,
    colour = "#0056b3",
    fill = "#007bff"
  ) +
  ggtitle("Distribution of Number of Robot Calls") +
  xlab("Number of Robot Calls") +
  ylab("Count") +
  theme_minimal(base_size = 16) +
  theme(text = element_text(colour = "#0056b3"))

# Plot 3: mean of robot_calling within each purchase value, one bar per value.
df |>
  group_by(purchase) |>
  summarise(robot_share = mean(robot_calling, na.rm = TRUE)) |>
  # The distinct purchase values, in sorted order, are relabelled "No"/"Yes".
  mutate(outcome = factor(purchase, labels = c("No", "Yes"))) |>
  ggplot(mapping = aes(x = outcome, y = robot_share)) +
  geom_col(fill = "#007bff") +
  ggtitle("% Robot Calling by Purchase Outcome") +
  xlab("Purchase") +
  ylab("Proportion Robot Calls") +
  theme_minimal(base_size = 16) +
  theme(
    legend.title = element_blank(),
    text = element_text(colour = "#0056b3")
  )

# Plot 4: mean of robot_calling within each marital_status value, drawn as
# horizontal bars (axes flipped so the category names read left to right).
df |>
  group_by(marital_status) |>
  summarise(robot_share = mean(robot_calling, na.rm = TRUE)) |>
  ggplot(mapping = aes(x = marital_status, y = robot_share)) +
  geom_col(fill = "#007bff") +
  coord_flip() +
  ggtitle("% Robot Calling by Marital Status") +
  xlab("Marital Status") +
  ylab("Proportion Robot Calls") +
  theme_minimal(base_size = 16) +
  theme(
    legend.title = element_blank(),
    text = element_text(colour = "#0056b3")
  )

# Plot 5: mean of robot_calling within each occupation value, drawn as
# horizontal bars.
df |>
  group_by(occupation) |>
  summarise(robot_share = mean(robot_calling, na.rm = TRUE)) |>
  ggplot(mapping = aes(x = occupation, y = robot_share)) +
  geom_col(fill = "#007bff") +
  coord_flip() +
  ggtitle("% Robot Calling by Occupation Group") +
  xlab("Occupation") +
  ylab("Proportion Robot Calls") +
  theme_minimal(base_size = 16) +
  theme(
    legend.title = element_blank(),
    text = element_text(colour = "#0056b3")
  )

# Plot 6: distribution of number_of_robot_calls for each purchase value,
# shown as side-by-side boxplots.
df |>
  # The distinct purchase values, in sorted order, are relabelled "No"/"Yes".
  mutate(outcome = factor(purchase, labels = c("No", "Yes"))) |>
  ggplot(mapping = aes(x = outcome, y = number_of_robot_calls)) +
  geom_boxplot(fill = "#007bff") +
  ggtitle("Number of Robot Calls by Purchase Outcome") +
  xlab("Purchase") +
  ylab("Number of Robot Calls") +
  theme_minimal(base_size = 16) +
  theme(
    legend.title = element_blank(),
    text = element_text(colour = "#0056b3")
  )

# Plot 7: mean of robot_calling within each GDPR_applies value. This is the
# one chart drawn in red ("#dc3545") instead of the blue used elsewhere.
df |>
  group_by(GDPR_applies) |>
  summarise(robot_share = mean(robot_calling, na.rm = TRUE)) |>
  # The distinct GDPR_applies values, in sorted order, become "No"/"Yes".
  mutate(gdpr_label = factor(GDPR_applies, labels = c("No", "Yes"))) |>
  ggplot(mapping = aes(x = gdpr_label, y = robot_share)) +
  geom_col(fill = "#dc3545") +
  ggtitle("% Robot Calling by GDPR Applicability") +
  xlab("GDPR Applies") +
  ylab("Proportion Robot Calls") +
  theme_minimal(base_size = 16) +
  theme(
    legend.title = element_blank(),
    text = element_text(colour = "#0056b3")
  )

# Plot 8: mean of robot_calling within 10-year age bands.
# Ages are cut into the intervals (15,25], (25,35], ..., (85,95]; an age
# outside that range would fall into an NA band.
df |>
  mutate(age_bin = cut(age, breaks = seq(15, 95, by = 10))) |>
  group_by(age_bin) |>
  summarise(pct_robot_calling = mean(robot_calling, na.rm = TRUE)) |>
  ggplot(aes(x = age_bin, y = pct_robot_calling)) +
  geom_col(fill = "#007bff") +
  theme_minimal(base_size = 16) +
  labs(title = "% Robot Calling by Age Group", x = "Age Group", y = "Proportion Robot Calls") +
  # Colour goes on `text` so the title and axis titles match the other plots;
  # the x tick labels are only rotated, as in the month plot.
  theme(text = element_text(color = "#0056b3"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_blank())

# Mean of robot_calling within each housing value, drawn as horizontal bars.
df |>
  group_by(housing) |>
  summarise(robot_share = mean(robot_calling, na.rm = TRUE)) |>
  ggplot(mapping = aes(x = housing, y = robot_share)) +
  geom_col(fill = "#007bff") +
  coord_flip() +
  ggtitle("% Robot Calling by Housing Loan") +
  xlab("Housing Loan") +
  ylab("Proportion Robot Calls") +
  theme_minimal(base_size = 16) +
  theme(
    legend.title = element_blank(),
    text = element_text(colour = "#0056b3")
  )

# Mean of robot_calling within each loan value, drawn as horizontal bars.
df |>
  group_by(loan) |>
  summarise(robot_share = mean(robot_calling, na.rm = TRUE)) |>
  ggplot(mapping = aes(x = loan, y = robot_share)) +
  geom_col(fill = "#007bff") +
  coord_flip() +
  ggtitle("% Robot Calling by Consumer Loan") +
  xlab("Consumer Loan") +
  ylab("Proportion Robot Calls") +
  theme_minimal(base_size = 16) +
  theme(
    legend.title = element_blank(),
    text = element_text(colour = "#0056b3")
  )

# Mean of robot_calling within each phone_type value, drawn as horizontal
# bars.
df |>
  group_by(phone_type) |>
  summarise(robot_share = mean(robot_calling, na.rm = TRUE)) |>
  ggplot(mapping = aes(x = phone_type, y = robot_share)) +
  geom_col(fill = "#007bff") +
  coord_flip() +
  ggtitle("% Robot Calling by Phone Type") +
  xlab("Phone Type") +
  ylab("Proportion Robot Calls") +
  theme_minimal(base_size = 16) +
  theme(
    legend.title = element_blank(),
    text = element_text(colour = "#0056b3")
  )

# Mean of robot_calling within each weekday value, drawn as horizontal bars
# in day-of-week order instead of the alphabetical order a character column
# would otherwise get.
df |>
  group_by(weekday) |>
  summarise(pct_robot_calling = mean(robot_calling, na.rm = TRUE)) |>
  mutate(
    # Rank each label by its first three letters (case-insensitive), so both
    # "mon" and "Monday" style values are recognised. Unrecognised labels get
    # NA and are sorted after the recognised ones.
    # NOTE(review): the label format of `weekday` is not visible here --
    # confirm against the data.
    day_rank = match(
      tolower(substr(weekday, 1, 3)),
      c("mon", "tue", "wed", "thu", "fri", "sat", "sun")
    ),
    # coord_flip() draws the first level at the bottom, so the levels are
    # reversed to put the earliest day at the top.
    weekday = factor(weekday, levels = rev(weekday[order(day_rank)]))
  ) |>
  ggplot(aes(x = weekday, y = pct_robot_calling)) +
  geom_col(fill = "#007bff") +
  coord_flip() +
  theme_minimal(base_size = 16) +
  labs(title = "% Robot Calling by Weekday", x = "Weekday", y = "Proportion Robot Calls") +
  theme(text = element_text(color = "#0056b3"),
        legend.title = element_blank())

# One equal-height bar per month, filled on a grey-to-blue gradient by the
# mean of purchase in that month. Bar height is the constant 1; only the fill
# carries information. Months are placed in calendar order instead of the
# alphabetical order a character column would otherwise get.
df |>
  group_by(month) |>
  summarise(pct_purchase = mean(purchase, na.rm = TRUE)) |>
  mutate(
    # Rank each label by its first three letters (case-insensitive) against
    # base R's month.abb, so "jan", "Jan" and "January" are all recognised.
    # Unrecognised labels get NA and keep their relative order at the end.
    # NOTE(review): the label format of `month` is not visible here --
    # confirm against the data.
    month_rank = match(tolower(substr(month, 1, 3)), tolower(month.abb)),
    month = factor(month, levels = month[order(month_rank)])
  ) |>
  ggplot(aes(x = month, y = 1, fill = pct_purchase)) +
  geom_col() +
  scale_fill_gradient(low = "#6c757d", high = "#007bff") +  # Gray to blue
  theme_minimal(base_size = 16) +
  labs(title = "% Purchase by Month (Fill by Pct Purchase)", x = "Month", y = "Count (Fixed)", fill = "% Purchase") +
  theme(text = element_text(color = "#0056b3"),
        axis.text.x = element_text(angle = 45, hjust = 1))