Skip to content

Latest commit

 

History

History
400 lines (310 loc) · 12.7 KB

File metadata and controls

400 lines (310 loc) · 12.7 KB

Correlation Analysis in Python

Slide 1: Understanding Correlation Fundamentals

Correlation analysis measures the strength and direction of relationships between variables in statistical data. The concept is fundamental in data science for identifying patterns, making predictions, and understanding variable dependencies in complex datasets.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reproducible synthetic sample: y is a linear function of x plus Gaussian noise
np.random.seed(42)
x = np.random.normal(0, 1, 1000)
noise = np.random.normal(0, 0.5, 1000)
y = 0.7 * x + noise  # Positive correlation

# Pearson r is the off-diagonal entry of the 2x2 correlation matrix
correlation = np.corrcoef(x, y)[0, 1]

# Scatter plot whose title reports the computed coefficient
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x, y, alpha=0.5)
ax.set_title(f'Correlation Example (r = {correlation:.2f})')
ax.set_xlabel('Variable X')
ax.set_ylabel('Variable Y')
plt.show()

Slide 2: Types of Correlation

Statistical correlation can manifest in three primary forms: positive correlation where variables increase together, negative correlation where one increases as the other decreases, and zero correlation indicating no relationship between variables.

# Three synthetic relationships over a shared x grid.
# The noise draws happen in the order positive, negative, zero.
x = np.linspace(-3, 3, 100)
y_positive = 0.8 * x + np.random.normal(0, 0.3, 100)
y_negative = -0.8 * x + np.random.normal(0, 0.3, 100)
y_zero = np.random.normal(0, 1, 100)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# One (axes, data, title) triple per panel, drawn left to right
panels = (
    (ax1, y_positive, 'Positive Correlation'),
    (ax2, y_negative, 'Negative Correlation'),
    (ax3, y_zero, 'No Correlation'),
)
for axis, values, label in panels:
    axis.scatter(x, values)
    axis.set_title(label)

plt.tight_layout()
plt.show()

Slide 3: Pearson Correlation Coefficient

The Pearson correlation coefficient quantifies the strength of the linear relationship between two continuous variables and takes values between -1 and 1. It is the covariance of the variables divided by the product of their standard deviations: $r = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_i (x_i - \bar{x})^2 \, \sum_i (y_i - \bar{y})^2}}$.

def pearson_correlation(x, y):
    """
    Compute the Pearson correlation coefficient of two samples by hand.

    The result is the sum of products of the mean-centred values divided by
    the square root of the product of their sums of squares.
    """
    # Centre each sample on its own mean once and reuse the deviations
    x_dev = x - np.mean(x)
    y_dev = y - np.mean(y)

    co_variation = np.sum(x_dev * y_dev)
    spread = np.sqrt(np.sum(x_dev**2) * np.sum(y_dev**2))
    return co_variation / spread

# Worked example on a five-point sample
x = np.arange(1, 6)
y = np.array([2, 4, 5, 4, 5])

correlation = pearson_correlation(x, y)
print(f"Correlation coefficient: {correlation:.4f}")

Slide 4: Marketing Analytics Case Study - Data Preparation

This case study uses a simulated marketing dataset to analyze the relationship between advertising spend and sales across different channels. The analysis helps identify the most effective marketing channels for budget allocation.

# Simulated marketing data: three spend channels plus a noisy sales response
np.random.seed(42)
n_samples = 1000

# (low, high) spend bounds per channel; dict order fixes the RNG draw order
spend_bounds = {
    'tv_spend': (10000, 100000),
    'social_media_spend': (5000, 50000),
    'email_spend': (1000, 20000),
}
data = {
    channel: np.random.uniform(low, high, n_samples)
    for channel, (low, high) in spend_bounds.items()
}

# Sales are a weighted sum of channel spend plus Gaussian noise
spend_signal = (
    0.6 * data['tv_spend'] +
    0.3 * data['social_media_spend'] +
    0.1 * data['email_spend']
)
data['sales'] = spend_signal + np.random.normal(0, 10000, n_samples)

df = pd.DataFrame(data)
print(df.head())
print("\nData Summary:")
print(df.describe())

Slide 5: Marketing Analytics Case Study - Correlation Analysis

Analyzing correlations between different marketing channels and sales reveals the effectiveness of each channel. This analysis guides marketing budget optimization and helps identify synergies between channels.

# Pairwise correlations between every column of the marketing frame
correlation_matrix = df.corr()

# Annotated heatmap on a fixed [-1, 1] colour scale centred at zero
heatmap_options = dict(annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Marketing Channels Correlation Matrix')
plt.show()

# Rank the columns by their correlation with sales, strongest first
# (the 'sales' row itself is included in the ranking)
channel_correlations = correlation_matrix['sales'].sort_values(ascending=False)
print("\nCorrelations with Sales:")
print(channel_correlations)

Slide 6: Nonlinear Correlation Analysis

Pearson correlation measures only linear association, so it understates relationships that are monotonic but curved. The Spearman and Kendall rank correlation coefficients measure monotonic association instead, and therefore capture relationships that are not strictly linear.

# Quadratic signal with additive Gaussian noise
x = np.linspace(0, 10, 1000)
y = x**2 + np.random.normal(0, 5, 1000)

# Pearson via numpy; the two rank-based coefficients via pandas
x_series, y_series = pd.Series(x), pd.Series(y)
pearson_corr = np.corrcoef(x, y)[0,1]
spearman_corr, kendall_corr = (
    x_series.corr(y_series, method=rank_method)
    for rank_method in ('spearman', 'kendall')
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x, y, alpha=0.5)
ax.set_title(f'Nonlinear Relationship\nPearson: {pearson_corr:.2f}, Spearman: {spearman_corr:.2f}, Kendall: {kendall_corr:.2f}')
ax.set_xlabel('X')
ax.set_ylabel('Y')
plt.show()

Slide 7: Advanced Correlation Matrix Visualization

Correlation matrix visualization provides insights into relationships between multiple variables simultaneously. This advanced implementation includes hierarchical clustering to group related variables and customizable visualization parameters for enhanced interpretation.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

def plot_corr_matrix(data, method='pearson', figsize=(10, 8)):
    """Draw a hierarchically clustered heatmap of a correlation matrix.

    Args:
        data: DataFrame whose columns are correlated with ``data.corr``.
        method: Correlation method forwarded to ``DataFrame.corr``.
        figsize: Figure size in inches, forwarded to ``sns.clustermap``.

    Returns:
        The correlation matrix computed from ``data`` (original column order).
    """
    # Calculate correlation matrix
    corr = data.corr(method=method)

    # Boolean mask selecting the entries strictly above the diagonal
    # NOTE(review): clustermap reorders rows/columns, so the hidden cells may
    # not form a triangle in the drawn heatmap -- confirm this is intended.
    mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

    # clustermap builds its own figure, so the size must be passed to it
    # directly; a preceding plt.figure() would only leave an empty figure open.
    grid = sns.clustermap(corr,
                          annot=True,
                          fmt='.2f',
                          cmap='coolwarm',
                          center=0,
                          vmin=-1,
                          vmax=1,
                          mask=mask,
                          figsize=figsize)

    # Title the whole figure rather than whichever axes happens to be current
    grid.ax_heatmap.figure.suptitle('Hierarchically Clustered Correlation Matrix')
    return corr

# Demo frame: three independent Gaussian columns (drawn in the order var1,
# var2, var3) plus var4, a noisy linear function of var1
np.random.seed(42)
n_samples = 100
data = pd.DataFrame({
    column: np.random.normal(0, 1, n_samples)
    for column in ('var1', 'var2', 'var3')
})
var4_noise = np.random.normal(0, 0.3, n_samples)
data['var4'] = 0.7 * data['var1'] + var4_noise

correlation_matrix = plot_corr_matrix(data)
print("\nCorrelation Matrix:")
print(correlation_matrix.round(3))

Slide 8: Time Series Correlation Analysis

Time series correlation analysis examines relationships between variables across different time periods, incorporating lag effects and temporal dependencies to understand dynamic relationships in sequential data.

def analyze_time_correlation(series1, series2, max_lag=5):
    """Correlate two equally long series at every lag from 0 to ``max_lag``.

    For a lag ``k > 0`` the value of ``series1`` at position ``t + k`` is
    paired with the value of ``series2`` at position ``t``.

    Returns:
        A list of ``(lag, correlation)`` tuples in increasing lag order.
    """
    def aligned(lag):
        # Lag 0 is special-cased because series2[:-0] would be empty
        if lag:
            return series1[lag:], series2[:-lag]
        return series1, series2

    return [
        (lag, np.corrcoef(*aligned(lag))[0,1])
        for lag in range(max_lag + 1)
    ]

# Generate time series data: two noisy sine waves, the second shifted in
# phase by 0.5 relative to the first
t = np.linspace(0, 10, 1000)
series1 = np.sin(t) + np.random.normal(0, 0.1, len(t))
series2 = np.sin(t + 0.5) + np.random.normal(0, 0.1, len(t))

# Calculate lagged correlations (lags 0..5 by default)
lag_correlations = analyze_time_correlation(series1, series2)

# Plot correlation as a function of lag
plt.figure(figsize=(10, 6))
lags, corrs = zip(*lag_correlations)
plt.plot(lags, corrs, 'o-')
plt.xlabel('Lag')
plt.ylabel('Correlation')
plt.title('Time-Lagged Correlation Analysis')
plt.grid(True)
# Display the figure, as every other plotting example in this file does;
# without this call nothing is shown when the code runs as a script.
plt.show()
print("Lag Correlations:", dict(lag_correlations))

Slide 9: Robust Correlation Methods

Robust correlation methods handle outliers and non-normal distributions effectively. These techniques provide reliable correlation estimates when data violates assumptions of traditional Pearson correlation.

def robust_correlation(x, y, method='spearman'):
    """Compute a rank-based correlation coefficient and its p-value.

    Args:
        x: First sample (array-like).
        y: Second sample (array-like), same length as ``x``.
        method: ``'spearman'`` for Spearman's rho or ``'kendall'`` for
            Kendall's tau.

    Returns:
        A ``(correlation, p_value)`` tuple as reported by SciPy.

    Raises:
        ValueError: If ``method`` is neither ``'spearman'`` nor ``'kendall'``.
    """
    # Imported here so both estimators are always in scope; previously
    # kendalltau was an undefined name and the 'kendall' branch raised
    # NameError.
    from scipy.stats import kendalltau, spearmanr

    if method == 'spearman':
        corr, p_value = spearmanr(x, y)
    elif method == 'kendall':
        corr, p_value = kendalltau(x, y)
    else:
        raise ValueError("Method must be 'spearman' or 'kendall'")
    return corr, p_value

# Strongly correlated sample contaminated by one extreme outlier
np.random.seed(42)
x = np.random.normal(0, 1, 100)
y = 0.8 * x + np.random.normal(0, 0.2, 100)
# Push the first point far off the trend, against its direction
x[0], y[0] = 10, -10

# Pearson comes from numpy; the rank-based estimates from robust_correlation
methods = ['pearson', 'spearman', 'kendall']
results = {
    name: (np.corrcoef(x, y)[0,1] if name == 'pearson'
           else robust_correlation(x, y, name)[0])
    for name in methods
}

print("Correlation Results:")
for method, corr in results.items():
    print(f"{method.capitalize()}: {corr:.3f}")

Slide 10: Correlation Significance Testing

Statistical significance testing for correlations determines whether observed relationships are likely to occur by chance. This implementation includes p-value calculation and confidence interval estimation.

def correlation_significance(x, y, alpha=0.05):
    """Test whether the Pearson correlation of two samples differs from zero.

    Args:
        x: First sample (array-like).
        y: Second sample (array-like), same length as ``x``.
        alpha: Significance level. It sets both the significance decision and
            the ``1 - alpha`` coverage of the confidence interval.

    Returns:
        dict with keys ``'correlation'`` (Pearson r), ``'p_value'`` (two-sided
        t-test with ``n - 2`` degrees of freedom), ``'ci_lower'`` and
        ``'ci_upper'`` (Fisher z-transform interval; NaN when ``n <= 3``,
        where its standard error is undefined) and ``'significant'``
        (``p_value < alpha``).
    """
    from scipy.stats import norm, t as t_dist

    n = len(x)
    # Clip guards against round-off pushing |r| marginally above 1
    r = np.clip(np.corrcoef(x, y)[0,1], -1.0, 1.0)

    # For |r| == 1 the t-statistic and Fisher z are infinite, which is the
    # correct limit (p = 0, CI collapses to r); only the warnings are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        t_stat = r * np.sqrt((n-2)/(1-r**2))
        z = np.arctanh(r)

    # Survival function instead of 1 - cdf: keeps precision for tiny p-values
    p_value = 2 * t_dist.sf(abs(t_stat), n-2)

    # Fisher z confidence interval whose width follows alpha
    if n > 3:
        se = 1/np.sqrt(n-3)
        z_crit = norm.ppf(1 - alpha/2)
        ci_lower = np.tanh(z - se * z_crit)
        ci_upper = np.tanh(z + se * z_crit)
    else:
        ci_lower = ci_upper = np.nan

    return {
        'correlation': r,
        'p_value': p_value,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'significant': bool(p_value < alpha)
    }

# Example usage: weak linear signal buried in noise
x = np.random.normal(0, 1, 100)
y = 0.3 * x + np.random.normal(0, 0.9, 100)

results = correlation_significance(x, y)
for key, value in results.items():
    # Print boolean entries as True/False; the ':.4f' format would render
    # the 'significant' flag as 1.0000 or 0.0000.
    if isinstance(value, (bool, np.bool_)):
        print(f"{key}: {value}")
    else:
        print(f"{key}: {value:.4f}")

Slide 11: Distance Correlation Analysis

Distance correlation measures dependence between variables of different dimensions, capturing both linear and nonlinear relationships. This implementation provides a more general measure of statistical dependence.

def distance_correlation(x, y):
    """Compute the sample distance correlation between two samples.

    Args:
        x: Array-like with one observation per entry along the first axis;
            any trailing axes are treated as the features of an observation.
        y: Array-like with the same number of observations as ``x``; its
            feature dimension may differ from that of ``x``.

    Returns:
        The distance correlation, or ``0.0`` when either sample has zero
        distance variance (for example a constant sample).

    Raises:
        ValueError: If the samples have different lengths or are empty.
    """
    x = np.atleast_1d(np.asarray(x, dtype=float))
    y = np.atleast_1d(np.asarray(y, dtype=float))

    if x.shape[0] != y.shape[0]:
        raise ValueError('Arrays must have same length')

    n = x.shape[0]
    if n == 0:
        raise ValueError('Arrays must not be empty')

    # Calculate distance matrices (pairwise Euclidean distance between rows)
    def dist_matrix(arr):
        return np.sqrt(np.sum((arr[:,None] - arr)**2, axis=-1))

    # One row per observation. reshape(n, -1) keeps multivariate observations
    # intact, whereas reshape(-1, 1) would flatten them into extra rows.
    dx = dist_matrix(x.reshape(n, -1))
    dy = dist_matrix(y.reshape(n, -1))

    # Double center distance matrices: subtract the means taken along each
    # axis and add back the grand mean
    def double_center(D):
        axis0_mean = D.mean(axis=0)
        axis1_mean = D.mean(axis=1)
        total_mean = D.mean()
        return D - axis0_mean - axis1_mean[:,None] + total_mean

    dcx = double_center(dx)
    dcy = double_center(dy)

    # Squared sample distance covariance and variances
    dcov = max((dcx * dcy).mean(), 0.0)  # clip round-off negatives before sqrt
    dvarx = (dcx * dcx).mean()
    dvary = (dcy * dcy).mean()

    denominator = np.sqrt(dvarx * dvary)
    if denominator == 0:
        # Degenerate sample: avoid 0/0 and report no dependence
        return 0.0

    return np.sqrt(dcov / denominator)

# Noisy sine wave sampled at random points over one full period
x = np.random.uniform(0, 2*np.pi, 100)
sine_noise = np.random.normal(0, 0.1, 100)
y = np.sin(x) + sine_noise

# Linear (Pearson) and distance-based measures on the same sample
pc = np.corrcoef(x, y)[0,1]
dc = distance_correlation(x, y)

print(f"Distance Correlation: {dc:.4f}")
print(f"Pearson Correlation: {pc:.4f}")

Slide 12: Partial Correlation Analysis

Partial correlation measures the relationship between two variables while controlling for the effects of other variables, revealing direct relationships in multivariate systems.

def partial_correlation(data, x, y, controlling_vars):
    """Correlate two columns after removing the linear effect of controls.

    Both columns are regressed (with an intercept) on all control columns at
    once, and the correlation of the two residual series is returned.
    Adjusting for the controls jointly matters when they are correlated with
    each other: removing them one at a time does not give the partial
    correlation in that case.

    Args:
        data: Mapping of column name to values (e.g. a DataFrame).
        x: Name of the first column.
        y: Name of the second column.
        controlling_vars: Iterable of column names to control for. If empty,
            the ordinary correlation of ``x`` and ``y`` is returned.

    Returns:
        The partial correlation coefficient.
    """
    x_values = np.asarray(data[x], dtype=float)
    y_values = np.asarray(data[y], dtype=float)

    # Design matrix: an intercept column followed by every control column
    columns = [np.ones(len(x_values))]
    columns.extend(np.asarray(data[name], dtype=float)
                   for name in controlling_vars)
    design = np.column_stack(columns)

    def residuals(target):
        # Least-squares fit of target on the design matrix
        coefficients = np.linalg.lstsq(design, target, rcond=None)[0]
        return target - design @ coefficients

    return np.corrcoef(residuals(x_values), residuals(y_values))[0,1]

# Example dataset: three Gaussian columns drawn in the order x, y, z
np.random.seed(42)
n = 1000
data = pd.DataFrame({
    column: np.random.normal(0, 1, n) for column in ('x', 'y', 'z')
})
# Replace the placeholder 'y' with a noisy linear mix of x and z
y_noise = np.random.normal(0, 0.5, n)
data['y'] = 0.5 * data['x'] + 0.3 * data['z'] + y_noise

# Ordinary correlation versus correlation after controlling for z
regular_corr = np.corrcoef(data['x'], data['y'])[0,1]
partial_corr = partial_correlation(data, 'x', 'y', ['z'])

print(f"Regular correlation: {regular_corr:.4f}")
print(f"Partial correlation (controlling for z): {partial_corr:.4f}")

Slide 13: Additional Resources

- https://arxiv.org/abs/2007.02731 - "Robust Correlation Analysis: A Review of Recent Developments with Applications to Data Science"
- https://arxiv.org/abs/1804.02532 - "Distance Correlation: A New Tool for Detecting Association and Measuring Correlation Between Data Sets"
- https://arxiv.org/abs/1909.10140 - "Modern Applications of Correlation Analysis in Time Series Data"
- https://arxiv.org/abs/2103.05825 - "Partial Correlation in High-Dimensional Data Analysis"