Cluster Likert Questions
[19]:
# 03_cluster_likert_questions.ipynb
import pandas as pd
import numpy as np
from pandas_survey_toolkit import nlp
from pandas_survey_toolkit.vis import cluster_heatmap_plot
# Create sample survey data with Likert scale responses.
# We simulate a product satisfaction survey in which POPULATION respondents
# each answer the 8 Likert questions listed below.

# Define our questions (one DataFrame column per question)
questions = [
    'q1_ease_of_use',
    'q2_product_quality',
    'q3_value_for_money',
    'q4_customer_service',
    'q5_would_recommend',
    'q6_meets_expectations',
    'q7_better_than_competitors',
    'q8_overall_satisfaction'
]

# Define our Likert scale options, ordered from most negative to most positive.
# The slices used below rely on this ordering.
likert_options = [
    'Strongly Disagree',
    'Disagree',
    'Neither Agree nor Disagree',
    'Agree',
    'Strongly Agree'
]

# Number of simulated respondents
POPULATION = 200

# Seed NumPy's global RNG so the simulated answers are reproducible
np.random.seed(42)

# Respondent ids run from 1 to POPULATION inclusive. range() excludes its stop
# value, hence the "+ 1"; without it only POPULATION - 1 respondents are created.
data = {'respondent_id': range(1, POPULATION + 1)}

# Generate random Likert responses with some patterns:
#   Group 1 (first 30% of respondents): generally positive
#   Group 2 (next 30% of respondents):  generally negative
#   Group 3 (remaining respondents):    mostly neutral ("don't care")
for q in questions:
    responses = []
    for i in range(1, POPULATION + 1):
        if i <= (0.3 * POPULATION):  # Positive group: Neither / Agree / Strongly Agree
            responses.append(np.random.choice(likert_options[2:], p=[0.1, 0.5, 0.4]))
        elif i <= (0.6 * POPULATION):  # Negative group: Strongly Disagree / Disagree / Neither
            responses.append(np.random.choice(likert_options[:3], p=[0.3, 0.5, 0.2]))
        else:  # Don't care group: Disagree / Neither / Agree, heavily weighted to Neither
            responses.append(np.random.choice(likert_options[1:4], p=[0.1, 0.8, 0.1]))
    data[q] = responses
# Assemble the simulated answers into a DataFrame
df = pd.DataFrame(data)

# Preview the raw, text-valued survey answers
print("Original survey data:")
display(df.head())

# Custom mapping for Likert scale values: lower-case answer text -> score.
# Both "disagree" variants collapse to -1, the neutral answer maps to 0 and
# both "agree" variants collapse to +1.
custom_mapping = {
    **dict.fromkeys(('strongly disagree', 'disagree'), -1),
    'neither agree nor disagree': 0,
    **dict.fromkeys(('agree', 'strongly agree'), 1),
}
Original survey data:
| | respondent_id | q1_ease_of_use | q2_product_quality | q3_value_for_money | q4_customer_service | q5_would_recommend | q6_meets_expectations | q7_better_than_competitors | q8_overall_satisfaction |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Agree | Strongly Agree | Strongly Agree | Agree | Strongly Agree | Neither Agree nor Disagree | Neither Agree nor Disagree | Agree |
| 1 | 2 | Strongly Agree | Strongly Agree | Strongly Agree | Agree | Agree | Strongly Agree | Strongly Agree | Agree |
| 2 | 3 | Strongly Agree | Neither Agree nor Disagree | Agree | Neither Agree nor Disagree | Strongly Agree | Agree | Strongly Agree | Strongly Agree |
| 3 | 4 | Agree | Agree | Strongly Agree | Agree | Strongly Agree | Strongly Agree | Strongly Agree | Agree |
| 4 | 5 | Agree | Strongly Agree | Agree | Agree | Strongly Agree | Agree | Strongly Agree | Agree |
[20]:
# Cluster the questions via the `cluster_questions` method available on the DataFrame
df_processed = df.cluster_questions(
    columns=questions,
    # likert_mapping=custom_mapping,  # left out: the default mapping handles most cases
    umap_n_neighbors=15,
    hdbscan_min_cluster_size=15,
    cluster_selection_epsilon=0.35,
)

# Names of the encoded Likert columns: "likert_encoded_" + the question name
likert_columns_with_prefix = ["likert_encoded_" + q for q in questions]

# Preview the encoded answers next to the respondent id
print("\nEncoded Likert data:")
encoded_preview_columns = ['respondent_id', *likert_columns_with_prefix]
display(df_processed[encoded_preview_columns].head())

# Preview the cluster assignment and membership probability per respondent
print("\nQuestion clustering results:")
cluster_preview_columns = ['respondent_id', 'question_cluster_id', 'question_cluster_probability']
display(df_processed[cluster_preview_columns].head())
# Visualize how the encoded answers are distributed within each cluster
print("\nCluster heatmap showing the sentiment distribution across questions:")

heatmap = cluster_heatmap_plot(
    df=df_processed,
    x="question_cluster_id",       # cluster ids along the x-axis
    y=likert_columns_with_prefix,  # encoded Likert columns to analyze
    max_width=30,                  # for better readability
)

# Render the heatmap in the notebook
display(heatmap)
# Simple interpretation aid: mean encoded score of every question within each cluster
cluster_summary = (
    df_processed
    .groupby('question_cluster_id')[likert_columns_with_prefix]
    .mean()
)
print("\nCluster averages for each question:")
display(cluster_summary)

# Respondent counts per cluster, ordered by cluster id
cluster_counts = (
    df_processed['question_cluster_id']
    .value_counts()
    .sort_index()
)
print("\nNumber of respondents in each cluster:")
display(cluster_counts)
Using default mapping:
-1: Phrases containing 'disagree', 'do not agree', etc.
0: Phrases containing 'neutral', 'neither', 'unsure', etc.
+1: Phrases containing 'agree' (but not 'disagree' or 'not agree')
NaN: NaN values are preserved
Agree -> 1: 282 times
Strongly Agree -> 1: 199 times
Neither Agree nor Disagree -> 0: 668 times
Disagree -> -1: 293 times
Strongly Disagree -> -1: 150 times
Encoded Likert data:
| | respondent_id | likert_encoded_q1_ease_of_use | likert_encoded_q2_product_quality | likert_encoded_q3_value_for_money | likert_encoded_q4_customer_service | likert_encoded_q5_would_recommend | likert_encoded_q6_meets_expectations | likert_encoded_q7_better_than_competitors | likert_encoded_q8_overall_satisfaction |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 |
| 1 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 2 | 3 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 1 |
| 3 | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 4 | 5 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
Question clustering results:
| | respondent_id | question_cluster_id | question_cluster_probability |
|---|---|---|---|
| 0 | 1 | 0.0 | 0.570234 |
| 1 | 2 | 0.0 | 1.000000 |
| 2 | 3 | 0.0 | 0.562724 |
| 3 | 4 | 0.0 | 1.000000 |
| 4 | 5 | 0.0 | 1.000000 |
Cluster heatmap showing the sentiment distribution across questions:
Cluster averages for each question:
| question_cluster_id | likert_encoded_q1_ease_of_use | likert_encoded_q2_product_quality | likert_encoded_q3_value_for_money | likert_encoded_q4_customer_service | likert_encoded_q5_would_recommend | likert_encoded_q6_meets_expectations | likert_encoded_q7_better_than_competitors | likert_encoded_q8_overall_satisfaction |
|---|---|---|---|---|---|---|---|---|
| 0.0 | 0.852459 | 0.885246 | 0.819672 | 0.868852 | 0.918033 | 0.852459 | 0.885246 | 0.885246 |
| 1.0 | 0.040816 | -0.081633 | 0.122449 | 0.081633 | 0.081633 | -0.040816 | 0.183673 | 0.000000 |
| 2.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3.0 | -0.680556 | -0.694444 | -0.569444 | -0.708333 | -0.833333 | -0.666667 | -0.777778 | -0.708333 |
Number of respondents in each cluster:
question_cluster_id
0.0 61
1.0 49
2.0 17
3.0 72
Name: count, dtype: int64
[ ]:
[ ]: