Cluster Likert Questions

[19]:

# 03_cluster_likert_questions.ipynb
import pandas as pd
import numpy as np
from pandas_survey_toolkit import nlp
from pandas_survey_toolkit.vis import cluster_heatmap_plot

# Create sample survey data with Likert scale responses.
# Simulates a product satisfaction survey in which POPULATION respondents
# each answer 8 Likert questions.

# Define our questions
questions = [
    'q1_ease_of_use',
    'q2_product_quality',
    'q3_value_for_money',
    'q4_customer_service',
    'q5_would_recommend',
    'q6_meets_expectations',
    'q7_better_than_competitors',
    'q8_overall_satisfaction'
]

# Define our Likert scale options, ordered from most negative to most positive.
# The slices used below rely on this ordering.
likert_options = [
    'Strongly Disagree',
    'Disagree',
    'Neither Agree nor Disagree',
    'Agree',
    'Strongly Agree'
]

# Number of simulated respondents
POPULATION = 200

# Seed NumPy's global RNG so the simulated responses are reproducible
np.random.seed(42)

# Respondent IDs are 1-based and run up to and including POPULATION.
# range() excludes its stop value, so the stop must be POPULATION + 1;
# range(1, POPULATION) would silently yield only POPULATION - 1 respondents.
respondent_ids = range(1, POPULATION + 1)
data = {'respondent_id': respondent_ids}

# Generate random Likert responses with some patterns, split by respondent ID:
# Group 1 (first 30% of respondents): generally positive
# Group 2 (next 30% of respondents): generally negative
# Group 3 (remaining 40%): mostly neutral ("don't care")
positive_cutoff = 0.3 * POPULATION
negative_cutoff = 0.6 * POPULATION

for q in questions:
    responses = []
    # One draw per respondent, in ID order, so the random stream is consumed
    # in a fixed and reproducible sequence.
    for i in respondent_ids:
        if i <= positive_cutoff:  # Positive group
            responses.append(np.random.choice(likert_options[2:], p=[0.1, 0.5, 0.4]))
        elif i <= negative_cutoff:  # Negative group
            responses.append(np.random.choice(likert_options[:3], p=[0.3, 0.5, 0.2]))
        else:  # Don't care group
            responses.append(np.random.choice(likert_options[1:4], p=[0.1, 0.8, 0.1]))
    data[q] = responses

# Create DataFrame: one row per respondent, one column per question
df = pd.DataFrame(data)

# Preview the first five rows of the simulated survey responses
print("Original survey data:")
display(df.head(n=5))

# Custom Likert-to-score mapping that collapses the five answer options into
# three values: both disagree variants -> -1, the neutral option -> 0, and
# both agree variants -> +1. Keys are written in lower case.
# NOTE(review): the answers in the data are title-cased; presumably the
# consumer of this mapping matches case-insensitively — confirm before use.
custom_mapping = {
    **dict.fromkeys(('strongly disagree', 'disagree'), -1),
    'neither agree nor disagree': 0,
    **dict.fromkeys(('agree', 'strongly agree'), 1),
}

Original survey data:

	respondent_id	q1_ease_of_use	q2_product_quality	q3_value_for_money	q4_customer_service	q5_would_recommend	q6_meets_expectations	q7_better_than_competitors	q8_overall_satisfaction
0	1	Agree	Strongly Agree	Strongly Agree	Agree	Strongly Agree	Neither Agree nor Disagree	Neither Agree nor Disagree	Agree
1	2	Strongly Agree	Strongly Agree	Strongly Agree	Agree	Agree	Strongly Agree	Strongly Agree	Agree
2	3	Strongly Agree	Neither Agree nor Disagree	Agree	Neither Agree nor Disagree	Strongly Agree	Agree	Strongly Agree	Strongly Agree
3	4	Agree	Agree	Strongly Agree	Agree	Strongly Agree	Strongly Agree	Strongly Agree	Agree
4	5	Agree	Strongly Agree	Agree	Agree	Strongly Agree	Agree	Strongly Agree	Agree

[20]:

# Cluster the survey via the DataFrame method `cluster_questions`, using the
# Likert question columns as input. The result is a new frame, df_processed.
# NOTE(review): the keyword names suggest these values tune UMAP and HDBSCAN
# inside the toolkit — confirm against the pandas_survey_toolkit docs.
df_processed = df.cluster_questions(
    columns=questions,
    # likert_mapping=custom_mapping,  # optional override; the default handles most cases
    umap_n_neighbors=15,
    hdbscan_min_cluster_size=15,
    cluster_selection_epsilon=0.35,
)

# Names of the encoded Likert columns: each question name with the
# "likert_encoded_" prefix
likert_columns_with_prefix = [f"likert_encoded_{question}" for question in questions]

# Show the encoded answers next to the respondent ID
print("\nEncoded Likert data:")
display(df_processed[['respondent_id', *likert_columns_with_prefix]].head())

# Show each respondent's cluster assignment and membership probability
print("\nQuestion clustering results:")
display(
    df_processed[
        ['respondent_id', 'question_cluster_id', 'question_cluster_probability']
    ].head()
)


# Visualize the cluster patterns with the toolkit's heatmap helper
print("\nCluster heatmap showing the sentiment distribution across questions:")
heatmap = cluster_heatmap_plot(
    df=df_processed,
    x="question_cluster_id",       # cluster IDs along the x-axis
    y=likert_columns_with_prefix,  # encoded Likert columns to analyze
    max_width=30,                  # set for readability, per the original note
)

# Render the heatmap object in the cell output
display(heatmap)

# Simple interpretation aid: mean encoded score per question within each cluster
cluster_summary = (
    df_processed
    .groupby('question_cluster_id')[likert_columns_with_prefix]
    .mean()
)
print("\nCluster averages for each question:")
display(cluster_summary)

# Respondent counts per cluster, ordered by cluster ID
cluster_counts = (
    df_processed['question_cluster_id']
    .value_counts()
    .sort_index()
)
print("\nNumber of respondents in each cluster:")
display(cluster_counts)

Using default mapping:
-1: Phrases containing 'disagree', 'do not agree', etc.
 0: Phrases containing 'neutral', 'neither', 'unsure', etc.
+1: Phrases containing 'agree' (but not 'disagree' or 'not agree')
NaN: NaN values are preserved
  Agree -> 1: 282 times
  Strongly Agree -> 1: 199 times
  Neither Agree nor Disagree -> 0: 668 times
  Disagree -> -1: 293 times
  Strongly Disagree -> -1: 150 times

Encoded Likert data:

	respondent_id	likert_encoded_q1_ease_of_use	likert_encoded_q2_product_quality	likert_encoded_q3_value_for_money	likert_encoded_q4_customer_service	likert_encoded_q5_would_recommend	likert_encoded_q6_meets_expectations	likert_encoded_q7_better_than_competitors	likert_encoded_q8_overall_satisfaction
0	1	1	1	1	1	1	0	0	1
1	2	1	1	1	1	1	1	1	1
2	3	1	0	1	0	1	1	1	1
3	4	1	1	1	1	1	1	1	1
4	5	1	1	1	1	1	1	1	1


Question clustering results:

	respondent_id	question_cluster_probability
0	1	0.570234
1	2	1.000000
2	3	0.562724
3	4	1.000000
4	5	1.000000


Cluster heatmap showing the sentiment distribution across questions:


Cluster averages for each question:

	likert_encoded_q1_ease_of_use	likert_encoded_q2_product_quality	likert_encoded_q3_value_for_money	likert_encoded_q4_customer_service	likert_encoded_q5_would_recommend	likert_encoded_q6_meets_expectations	likert_encoded_q7_better_than_competitors	likert_encoded_q8_overall_satisfaction
question_cluster_id
0.0	0.852459	0.885246	0.819672	0.868852	0.918033	0.852459	0.885246	0.885246
1.0	0.040816	-0.081633	0.122449	0.081633	0.081633	-0.040816	0.183673	0.000000
2.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
3.0	-0.680556	-0.694444	-0.569444	-0.708333	-0.833333	-0.666667	-0.777778	-0.708333


Number of respondents in each cluster:

question_cluster_id
0.0    61
1.0    49
2.0    17
3.0    72
Name: count, dtype: int64

[ ]:

[ ]: