Cluster Likert Questions

[19]:
# 03_cluster_likert_questions.ipynb
import pandas as pd
import numpy as np
from pandas_survey_toolkit import nlp
from pandas_survey_toolkit.vis import cluster_heatmap_plot

# Create sample survey data with Likert scale responses.
# We simulate a product satisfaction survey in which POPULATION respondents
# each answer 8 Likert questions.

# Define our questions (one DataFrame column per question)
questions = [
    'q1_ease_of_use',
    'q2_product_quality',
    'q3_value_for_money',
    'q4_customer_service',
    'q5_would_recommend',
    'q6_meets_expectations',
    'q7_better_than_competitors',
    'q8_overall_satisfaction'
]

# Define our Likert scale options, ordered from most negative to most positive.
# The slices used below rely on this ordering.
likert_options = [
    'Strongly Disagree',
    'Disagree',
    'Neither Agree nor Disagree',
    'Agree',
    'Strongly Agree'
]

# Number of simulated respondents
POPULATION = 200

# Seed NumPy's global RNG so the simulated answers are reproducible
np.random.seed(42)

# Respondent ids run from 1 to POPULATION inclusive. range() excludes its stop
# value, so the stop must be POPULATION + 1 to get exactly POPULATION rows.
respondent_ids = range(1, POPULATION + 1)
data = {'respondent_id': respondent_ids}

# Generate random Likert responses with three built-in patterns:
#   Group 1 (first 30% of respondents): generally positive
#   Group 2 (next 30% of respondents):  generally negative
#   Group 3 (remaining 40%):            mostly neutral ("don't care")
positive_cutoff = 0.3 * POPULATION
negative_cutoff = 0.6 * POPULATION

for q in questions:
    responses = []
    for i in respondent_ids:
        if i <= positive_cutoff:  # Positive group: neutral / agree / strongly agree
            responses.append(np.random.choice(likert_options[2:], p=[0.1, 0.5, 0.4]))
        elif i <= negative_cutoff:  # Negative group: strongly disagree / disagree / neutral
            responses.append(np.random.choice(likert_options[:3], p=[0.3, 0.5, 0.2]))
        else:  # Don't care group: overwhelmingly neutral
            responses.append(np.random.choice(likert_options[1:4], p=[0.1, 0.8, 0.1]))
    data[q] = responses

# Create DataFrame: one row per respondent, one column per question
df = pd.DataFrame(data)

# Show the first few rows of the raw (un-encoded) survey responses
print("Original survey data:")
display(df.head())

# Custom mapping from Likert phrases to a three-level score:
# both "disagree" variants collapse to -1, the neutral option is 0 and both
# "agree" variants collapse to +1.
# NOTE(review): keys are lower-case while the generated responses are
# title-case -- presumably the consumer normalises case; confirm before use.
custom_mapping = {
    **dict.fromkeys(('strongly disagree', 'disagree'), -1),
    'neither agree nor disagree': 0,
    **dict.fromkeys(('agree', 'strongly agree'), 1),
}

Original survey data:
respondent_id q1_ease_of_use q2_product_quality q3_value_for_money q4_customer_service q5_would_recommend q6_meets_expectations q7_better_than_competitors q8_overall_satisfaction
0 1 Agree Strongly Agree Strongly Agree Agree Strongly Agree Neither Agree nor Disagree Neither Agree nor Disagree Agree
1 2 Strongly Agree Strongly Agree Strongly Agree Agree Agree Strongly Agree Strongly Agree Agree
2 3 Strongly Agree Neither Agree nor Disagree Agree Neither Agree nor Disagree Strongly Agree Agree Strongly Agree Strongly Agree
3 4 Agree Agree Strongly Agree Agree Strongly Agree Strongly Agree Strongly Agree Agree
4 5 Agree Strongly Agree Agree Agree Strongly Agree Agree Strongly Agree Agree
[20]:

# Use pandas method chaining to process the data.
# NOTE(review): `.cluster_questions` is not a built-in pandas method; it is
# presumably registered on DataFrame by importing pandas_survey_toolkit --
# confirm against the toolkit documentation.
df_processed = (
    df
    # Cluster the questions
    .cluster_questions(
        columns=questions,
        # likert_mapping=custom_mapping,  # default handles most cases
        umap_n_neighbors=15,
        hdbscan_min_cluster_size=15,
        cluster_selection_epsilon=0.35,
    )
)

# Get the list of encoded Likert columns (one "likert_encoded_" column per
# question is expected in the processed frame)
likert_columns_with_prefix = [f"likert_encoded_{q}" for q in questions]

# Display encoded data
print("\nEncoded Likert data:")
display(df_processed[['respondent_id'] + likert_columns_with_prefix].head())

# Display clustering results
print("\nQuestion clustering results:")
display(
    df_processed[
        ['respondent_id', 'question_cluster_id', 'question_cluster_probability']
    ].head()
)

# Use the cluster_heatmap_plot function to visualize cluster patterns
print("\nCluster heatmap showing the sentiment distribution across questions:")
heatmap = cluster_heatmap_plot(
    df=df_processed,
    x="question_cluster_id",  # Cluster IDs as the x-axis
    y=likert_columns_with_prefix,  # Encoded Likert columns to analyze
    max_width=30,  # For better readability
)

# Display the heatmap
display(heatmap)

# Simple interpretation of the clusters: mean encoded score per question
# within each cluster
cluster_summary = (
    df_processed.groupby('question_cluster_id')[likert_columns_with_prefix].mean()
)
print("\nCluster averages for each question:")
display(cluster_summary)

# Calculate respondent counts per cluster, ordered by cluster id
cluster_counts = df_processed['question_cluster_id'].value_counts().sort_index()
print("\nNumber of respondents in each cluster:")
display(cluster_counts)
Using default mapping:
-1: Phrases containing 'disagree', 'do not agree', etc.
 0: Phrases containing 'neutral', 'neither', 'unsure', etc.
+1: Phrases containing 'agree' (but not 'disagree' or 'not agree')
NaN: NaN values are preserved
  Agree -> 1: 282 times
  Strongly Agree -> 1: 199 times
  Neither Agree nor Disagree -> 0: 668 times
  Disagree -> -1: 293 times
  Strongly Disagree -> -1: 150 times

Encoded Likert data:
respondent_id likert_encoded_q1_ease_of_use likert_encoded_q2_product_quality likert_encoded_q3_value_for_money likert_encoded_q4_customer_service likert_encoded_q5_would_recommend likert_encoded_q6_meets_expectations likert_encoded_q7_better_than_competitors likert_encoded_q8_overall_satisfaction
0 1 1 1 1 1 1 0 0 1
1 2 1 1 1 1 1 1 1 1
2 3 1 0 1 0 1 1 1 1
3 4 1 1 1 1 1 1 1 1
4 5 1 1 1 1 1 1 1 1

Question clustering results:
respondent_id question_cluster_id question_cluster_probability
0 1 0.0 0.570234
1 2 0.0 1.000000
2 3 0.0 0.562724
3 4 0.0 1.000000
4 5 0.0 1.000000

Cluster heatmap showing the sentiment distribution across questions:

Cluster averages for each question:
likert_encoded_q1_ease_of_use likert_encoded_q2_product_quality likert_encoded_q3_value_for_money likert_encoded_q4_customer_service likert_encoded_q5_would_recommend likert_encoded_q6_meets_expectations likert_encoded_q7_better_than_competitors likert_encoded_q8_overall_satisfaction
question_cluster_id
0.0 0.852459 0.885246 0.819672 0.868852 0.918033 0.852459 0.885246 0.885246
1.0 0.040816 -0.081633 0.122449 0.081633 0.081633 -0.040816 0.183673 0.000000
2.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
3.0 -0.680556 -0.694444 -0.569444 -0.708333 -0.833333 -0.666667 -0.777778 -0.708333

Number of respondents in each cluster:
question_cluster_id
0.0    61
1.0    49
2.0    17
3.0    72
Name: count, dtype: int64
[ ]:

[ ]: