Slide 1: Introduction to Distance Metrics
Distance metrics are mathematical functions used to measure the similarity or dissimilarity between two points in a given space. They play a crucial role in various fields, including machine learning, data analysis, and information retrieval. This slideshow will demystify distance metrics and demonstrate their implementation using Python.
import numpy as np
def euclidean_distance(point1, point2):
return np.sqrt(np.sum((point1 - point2)**2))
# Example usage
p1 = np.array([1, 2, 3])
p2 = np.array([4, 5, 6])
distance = euclidean_distance(p1, p2)
print(f"Euclidean distance between {p1} and {p2}: {distance:.2f}")Slide 2: Euclidean Distance
Euclidean distance is the most common distance metric, representing the straight-line distance between two points in Euclidean space. It is calculated as the square root of the sum of squared differences between corresponding coordinates.
import numpy as np
import matplotlib.pyplot as plt
def plot_euclidean_distance(p1, p2):
plt.figure(figsize=(8, 6))
plt.plot([p1[0], p2[0]], [p1[1], p2[1]], 'ro-')
plt.plot([p1[0], p2[0]], [p1[1], p1[1]], 'b--')
plt.plot([p2[0], p2[0]], [p1[1], p2[1]], 'b--')
plt.text((p1[0] + p2[0])/2, p1[1] - 0.1, f'{abs(p2[0] - p1[0]):.2f}', ha='center')
plt.text(p2[0] + 0.1, (p1[1] + p2[1])/2, f'{abs(p2[1] - p1[1]):.2f}', va='center')
plt.text((p1[0] + p2[0])/2, (p1[1] + p2[1])/2, f'{euclidean_distance(p1, p2):.2f}', ha='center', va='center', bbox=dict(facecolor='white', alpha=0.5))
plt.xlim(min(p1[0], p2[0]) - 1, max(p1[0], p2[0]) + 1)
plt.ylim(min(p1[1], p2[1]) - 1, max(p1[1], p2[1]) + 1)
plt.grid(True)
plt.title('Euclidean Distance Visualization')
plt.show()
p1 = np.array([1, 1])
p2 = np.array([4, 5])
plot_euclidean_distance(p1, p2)Slide 3: Manhattan Distance
Manhattan distance, also known as L1 distance or city block distance, measures the sum of absolute differences between coordinates. It represents the distance a taxi would drive in a city laid out in a grid-like pattern.
def manhattan_distance(point1, point2):
return np.sum(np.abs(point1 - point2))
p1 = np.array([1, 2, 3])
p2 = np.array([4, 5, 6])
distance = manhattan_distance(p1, p2)
print(f"Manhattan distance between {p1} and {p2}: {distance}")
# Visualizing Manhattan vs Euclidean distance
def plot_manhattan_vs_euclidean(p1, p2):
plt.figure(figsize=(8, 6))
plt.plot([p1[0], p2[0]], [p1[1], p1[1]], 'r-', label='Manhattan')
plt.plot([p2[0], p2[0]], [p1[1], p2[1]], 'r-')
plt.plot([p1[0], p2[0]], [p1[1], p2[1]], 'b--', label='Euclidean')
plt.plot(p1[0], p1[1], 'go', label='Point 1')
plt.plot(p2[0], p2[1], 'go', label='Point 2')
plt.legend()
plt.grid(True)
plt.title('Manhattan vs Euclidean Distance')
plt.show()
plot_manhattan_vs_euclidean(np.array([1, 1]), np.array([4, 5]))Slide 4: Minkowski Distance
Minkowski distance is a generalization of Euclidean and Manhattan distances. It is parameterized by a value p, where p=1 gives Manhattan distance, p=2 gives Euclidean distance, and p→∞ gives Chebyshev distance.
def minkowski_distance(point1, point2, p):
return np.power(np.sum(np.abs(point1 - point2)**p), 1/p)
p1 = np.array([1, 2, 3])
p2 = np.array([4, 5, 6])
for p in [1, 2, 3, np.inf]:
distance = minkowski_distance(p1, p2, p)
print(f"Minkowski distance (p={p}) between {p1} and {p2}: {distance:.2f}")Slide 5: Cosine Similarity
Cosine similarity measures the cosine of the angle between two vectors, indicating their directional similarity regardless of magnitude. It's particularly useful in text analysis and recommendation systems.
def cosine_similarity(vec1, vec2):
dot_product = np.dot(vec1, vec2)
norm1 = np.linalg.norm(vec1)
norm2 = np.linalg.norm(vec2)
return dot_product / (norm1 * norm2)
v1 = np.array([1, 2, 3])
v2 = np.array([4, 5, 6])
similarity = cosine_similarity(v1, v2)
print(f"Cosine similarity between {v1} and {v2}: {similarity:.4f}")
# Visualizing cosine similarity
def plot_cosine_similarity(v1, v2):
plt.figure(figsize=(8, 6))
plt.quiver(0, 0, v1[0], v1[1], angles='xy', scale_units='xy', scale=1, color='r', label='Vector 1')
plt.quiver(0, 0, v2[0], v2[1], angles='xy', scale_units='xy', scale=1, color='b', label='Vector 2')
plt.xlim(-1, max(v1[0], v2[0]) + 1)
plt.ylim(-1, max(v1[1], v2[1]) + 1)
plt.legend()
plt.grid(True)
plt.title(f'Cosine Similarity: {cosine_similarity(v1, v2):.4f}')
plt.show()
plot_cosine_similarity(np.array([1, 2]), np.array([2, 3]))Slide 6: Hamming Distance
Hamming distance measures the number of positions at which corresponding symbols in two equal-length strings or vectors are different. It's commonly used in information theory and error detection.
def hamming_distance(s1, s2):
if len(s1) != len(s2):
raise ValueError("Strings must have equal length")
return sum(c1 != c2 for c1, c2 in zip(s1, s2))
string1 = "karolin"
string2 = "kathrin"
distance = hamming_distance(string1, string2)
print(f"Hamming distance between '{string1}' and '{string2}': {distance}")
# Visualizing Hamming distance
def visualize_hamming_distance(s1, s2):
fig, ax = plt.subplots(figsize=(10, 2))
ax.axis('off')
for i, (c1, c2) in enumerate(zip(s1, s2)):
color = 'red' if c1 != c2 else 'green'
ax.text(i, 0.6, c1, ha='center', va='center', color=color, fontsize=20)
ax.text(i, 0.4, c2, ha='center', va='center', color=color, fontsize=20)
plt.title(f"Hamming Distance: {hamming_distance(s1, s2)}")
plt.tight_layout()
plt.show()
visualize_hamming_distance(string1, string2)Slide 7: Levenshtein Distance
Levenshtein distance, also known as edit distance, measures the minimum number of single-character edits (insertions, deletions, or substitutions) required to change one word into another. It's widely used in natural language processing and bioinformatics.
def levenshtein_distance(s1, s2):
if len(s1) < len(s2):
return levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
word1 = "kitten"
word2 = "sitting"
distance = levenshtein_distance(word1, word2)
print(f"Levenshtein distance between '{word1}' and '{word2}': {distance}")Slide 8: Jaccard Similarity
Jaccard similarity measures the similarity between finite sample sets by comparing the size of their intersection to the size of their union. It's particularly useful for comparing text documents or gene sequences.
def jaccard_similarity(set1, set2):
intersection = len(set1.intersection(set2))
union = len(set1.union(set2))
return intersection / union
text1 = "The quick brown fox"
text2 = "The fast brown dog"
set1 = set(text1.lower().split())
set2 = set(text2.lower().split())
similarity = jaccard_similarity(set1, set2)
print(f"Jaccard similarity between '{text1}' and '{text2}': {similarity:.4f}")
# Visualizing Jaccard similarity
def plot_jaccard_similarity(set1, set2):
plt.figure(figsize=(8, 6))
venn2([set1, set2], ('Set 1', 'Set 2'))
plt.title(f'Jaccard Similarity: {jaccard_similarity(set1, set2):.4f}')
plt.show()
plot_jaccard_similarity(set1, set2)Slide 9: Real-Life Example: Image Similarity
Distance metrics can be used to compare images by treating them as high-dimensional vectors. Here's an example using Mean Squared Error (MSE) to compare two images:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
def mse_distance(img1, img2):
return np.mean((img1 - img2) ** 2)
# Load and preprocess images
def load_image(path):
return np.array(Image.open(path).convert('L').resize((100, 100)))
img1 = load_image('image1.jpg')
img2 = load_image('image2.jpg')
distance = mse_distance(img1, img2)
print(f"MSE distance between images: {distance:.2f}")
# Visualize images and their difference
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
ax1.imshow(img1, cmap='gray')
ax1.set_title('Image 1')
ax2.imshow(img2, cmap='gray')
ax2.set_title('Image 2')
ax3.imshow(np.abs(img1 - img2), cmap='hot')
ax3.set_title('Absolute Difference')
plt.show()Slide 10: Real-Life Example: Text Document Similarity
Distance metrics are crucial in natural language processing for tasks like document clustering and information retrieval. Here's an example using cosine similarity to compare text documents:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
documents = [
"The quick brown fox jumps over the lazy dog",
"A fast brown dog leaps over the sleepy cat",
"Python is a versatile programming language",
]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
similarity_matrix = cosine_similarity(tfidf_matrix)
print("Document Similarity Matrix:")
print(similarity_matrix)
# Visualize similarity matrix
plt.figure(figsize=(8, 6))
plt.imshow(similarity_matrix, cmap='YlOrRd')
plt.colorbar()
plt.title('Document Similarity Matrix')
plt.xlabel('Document Index')
plt.ylabel('Document Index')
plt.show()Slide 11: Choosing the Right Distance Metric
The choice of distance metric depends on the nature of your data and the problem you're trying to solve. Consider these factors:
- Data type: Continuous, categorical, or mixed
- Dimensionality: Low or high-dimensional data
- Scale sensitivity: Whether differences in scale matter
- Interpretability: How easily the metric can be understood
- Computational efficiency: Important for large datasets
Example: For text data, cosine similarity is often preferred as it focuses on the direction of vectors rather than their magnitude.
# Comparison of different distance metrics
from scipy.spatial.distance import euclidean, cityblock, cosine
v1 = np.array([1, 2, 3])
v2 = np.array([4, 5, 6])
print(f"Euclidean distance: {euclidean(v1, v2):.4f}")
print(f"Manhattan distance: {cityblock(v1, v2):.4f}")
print(f"Cosine distance: {cosine(v1, v2):.4f}")Slide 12: Implementing Custom Distance Metrics
Sometimes, you may need to implement a custom distance metric tailored to your specific problem. Here's an example of a weighted Euclidean distance:
import numpy as np
import matplotlib.pyplot as plt
def weighted_euclidean_distance(point1, point2, weights):
return np.sqrt(np.sum(weights * (point1 - point2)**2))
p1 = np.array([1, 2, 3])
p2 = np.array([4, 5, 6])
weights = np.array([0.5, 1, 2]) # More weight on the third dimension
distance = weighted_euclidean_distance(p1, p2, weights)
print(f"Weighted Euclidean distance: {distance:.4f}")
# Visualizing the effect of weights
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.bar(range(3), np.abs(p1 - p2), label='Unweighted')
ax1.bar(range(3), weights * np.abs(p1 - p2), alpha=0.5, label='Weighted')
ax1.set_title('Dimension-wise Differences')
ax1.legend()
ax2.scatter(p1[0], p1[1], s=100, label='Point 1')
ax2.scatter(p2[0], p2[1], s=100, label='Point 2')
ax2.plot([p1[0], p2[0]], [p1[1], p2[1]], 'r--')
ax2.set_title('2D Projection')
ax2.legend()
plt.tight_layout()
plt.show()Slide 13: Optimizing Distance Computations
When working with large datasets, efficient distance computation becomes crucial. Here are some strategies to optimize distance calculations:
import numpy as np
from scipy.spatial.distance import cdist
# Generate random data
n_samples = 1000
n_features = 10
X = np.random.rand(n_samples, n_features)
Y = np.random.rand(n_samples, n_features)
# Efficient pairwise distance computation
distances = cdist(X, Y, metric='euclidean')
print(f"Shape of distance matrix: {distances.shape}")
print(f"Mean distance: {np.mean(distances):.4f}")
# K-nearest neighbors using efficient algorithms
from sklearn.neighbors import NearestNeighbors
k = 5
nn = NearestNeighbors(n_neighbors=k, algorithm='ball_tree')
nn.fit(X)
distances, indices = nn.kneighbors(Y)
print(f"Shape of k-NN distances: {distances.shape}")
print(f"Mean k-NN distance: {np.mean(distances):.4f}")Slide 14: Dimensionality Reduction and Distance Metrics
High-dimensional data can suffer from the "curse of dimensionality," affecting distance metrics. Dimensionality reduction techniques can help mitigate this issue:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Generate high-dimensional data
n_samples = 1000
n_features = 50
X = np.random.rand(n_samples, n_features)
# Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# Apply t-SNE
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)
# Visualize results
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.5)
ax1.set_title('PCA')
ax2.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.5)
ax2.set_title('t-SNE')
plt.tight_layout()
plt.show()
print(f"Explained variance ratio (PCA): {pca.explained_variance_ratio_}")Slide 15: Additional Resources
For further exploration of distance metrics and their applications, consider these resources:
- "A Survey of Binary Similarity and Distance Measures" by Seung-Seok Choi et al. (2010) ArXiv: https://arxiv.org/abs/1002.0184
- "Similarity Measures for Text Document Clustering" by Anna Huang (2008) Available at: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.332.4480
- "An Introduction to Information Retrieval" by Christopher D. Manning et al. Available at: https://nlp.stanford.edu/IR-book/
These resources provide in-depth discussions on various distance metrics, their properties, and applications in different domains such as information retrieval, natural language processing, and data mining.