Machine Learning with Scikit-Learn: Complete Guide
Table of Contents
- Introduction to Scikit-Learn
- Data Preprocessing
- Supervised Learning
- Unsupervised Learning
- Model Evaluation
- Feature Engineering
- Pipeline Creation
- Model Selection
- Advanced Techniques
- Production Deployment
Introduction to Scikit-Learn {#introduction}
Scikit-learn is the most popular machine learning library for Python, providing simple and efficient tools for data mining and data analysis. It's built on NumPy, SciPy, and matplotlib and offers a consistent API for various machine learning algorithms.
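The "consistent API" means every estimator exposes the same small set of methods (fit, predict, and, for transformers, transform), so swapping one model for another rarely requires changing the surrounding code. A minimal sketch of that idea:
# Minimal sketch of the shared estimator interface:
# every estimator is constructed, fitted, and used the same way.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
X, y = load_iris(return_X_y=True)
for estimator in (LogisticRegression(max_iter=1000), DecisionTreeClassifier()):
    estimator.fit(X, y)  # same training call for every estimator
    print(type(estimator).__name__, estimator.predict(X[:3]))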
Key Features
- Simple and efficient: Easy-to-use API
- Comprehensive: Wide range of algorithms
- Well-documented: Excellent documentation and examples
- Active community: Regular updates and support
- Production-ready: Used in many production systems
Installation and Setup
# Install scikit-learn
pip install scikit-learn
# Import common modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
Basic Workflow
# 1. Load and explore data
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
# 2. Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# 3. Choose and train model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# 4. Make predictions
y_pred = model.predict(X_test)
# 5. Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
Data Preprocessing {#preprocessing}
Handling Missing Data
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Sample data with missing values
data = pd.DataFrame({
'age': [25, 30, np.nan, 35, 40],
'income': [50000, np.nan, 75000, 80000, np.nan],
'score': [85, 90, 88, np.nan, 92]
})
# Simple imputation strategies
# Mean imputation
mean_imputer = SimpleImputer(strategy='mean')
data_mean_imputed = pd.DataFrame(
mean_imputer.fit_transform(data),
columns=data.columns
)
# Median imputation
median_imputer = SimpleImputer(strategy='median')
data_median_imputed = pd.DataFrame(
median_imputer.fit_transform(data),
columns=data.columns
)
# Mode imputation (for categorical data)
mode_imputer = SimpleImputer(strategy='most_frequent')
# KNN imputation
knn_imputer = KNNImputer(n_neighbors=2)
data_knn_imputed = pd.DataFrame(
knn_imputer.fit_transform(data),
columns=data.columns
)
# Iterative imputation (MICE)
iterative_imputer = IterativeImputer(random_state=42)
data_iterative_imputed = pd.DataFrame(
iterative_imputer.fit_transform(data),
columns=data.columns
)
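Two practical points worth keeping in mind: SimpleImputer can append binary "was missing" indicator columns via add_indicator=True, and in a train/test workflow the imputer should be fitted on the training rows only so test-set statistics don't leak into the model. A small sketch of both, reusing the toy frame above (the 3/2 row split is purely illustrative):
# Sketch 1: append binary missing-value flags alongside the imputed values
flag_imputer = SimpleImputer(strategy='median', add_indicator=True)
data_flagged = flag_imputer.fit_transform(data)
print(data_flagged.shape)  # (5, 6): three imputed columns plus three indicator columns
# Sketch 2: fit on the training rows only to avoid leakage, then reuse on unseen rows
train_part, test_part = data.iloc[:3], data.iloc[3:]
leak_free_imputer = SimpleImputer(strategy='median')
train_imputed = leak_free_imputer.fit_transform(train_part)  # statistics come from train only
test_imputed = leak_free_imputer.transform(test_part)        # same statistics applied to test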
Feature Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
# Sample data
X = np.array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9],
[10, 11, 12]])
# Standard Scaling (z-score normalization)
standard_scaler = StandardScaler()
X_standard = standard_scaler.fit_transform(X)
print("Standard scaled:")
print(X_standard)
# Min-Max Scaling (0-1 normalization)
minmax_scaler = MinMaxScaler()
X_minmax = minmax_scaler.fit_transform(X)
print("Min-Max scaled:")
print(X_minmax)
# Robust Scaling (uses median and IQR)
robust_scaler = RobustScaler()
X_robust = robust_scaler.fit_transform(X)
print("Robust scaled:")
print(X_robust)
# Power Transformer (Yeo-Johnson)
power_transformer = PowerTransformer(method='yeo-johnson')
X_power = power_transformer.fit_transform(X)
# Quantile Transformer (uniform distribution)
quantile_transformer = QuantileTransformer(output_distribution='uniform')
X_quantile = quantile_transformer.fit_transform(X)
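As with imputers, a scaler's statistics should be learned from training data only and then reused on new data; inverse_transform maps scaled values back to the original units. A short sketch reusing the standard_scaler fitted above (X_new is just made-up data with the same three features):
# Sketch: reuse a fitted scaler on new data and invert the transformation
X_new = np.array([[2, 3, 4],
                  [8, 9, 10]])
X_new_scaled = standard_scaler.transform(X_new)            # reuse means/stds learned above
X_recovered = standard_scaler.inverse_transform(X_new_scaled)
print(X_recovered)  # matches X_new up to floating-point error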
Encoding Categorical Variables
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
# Sample categorical data
data = pd.DataFrame({
'color': ['red', 'blue', 'green', 'red', 'blue'],
'size': ['S', 'M', 'L', 'M', 'S'],
'rating': ['good', 'excellent', 'poor', 'good', 'excellent']
})
# Label Encoding (intended for target labels; categories are encoded alphabetically,
# so the intended S < M < L order is not preserved -- prefer OrdinalEncoder for ordered features)
label_encoder = LabelEncoder()
data['size_encoded'] = label_encoder.fit_transform(data['size'])
# Ordinal Encoding (with custom order)
ordinal_encoder = OrdinalEncoder(
categories=[['poor', 'good', 'excellent']]
)
data['rating_encoded'] = ordinal_encoder.fit_transform(
data[['rating']]
).flatten()
# One-Hot Encoding
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')  # 'sparse' was renamed to 'sparse_output' in scikit-learn 1.2
color_encoded = onehot_encoder.fit_transform(data[['color']])
color_feature_names = onehot_encoder.get_feature_names_out(['color'])
# Add one-hot encoded features to dataframe
for i, feature_name in enumerate(color_feature_names):
data[feature_name] = color_encoded[:, i]
print(data)
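Encoding column by column quickly gets unwieldy. ColumnTransformer (imported above) applies different encoders to different columns in a single step; here is a sketch on the same toy frame, where the column lists and the remainder setting are just this example's choices:
# Sketch: encode several columns in one step with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(sparse_output=False, drop='first'), ['color']),
        ('ordinal', OrdinalEncoder(categories=[['poor', 'good', 'excellent']]), ['rating'])
    ],
    remainder='drop'  # unlisted columns are dropped; use 'passthrough' to keep them
)
encoded = preprocessor.fit_transform(data[['color', 'size', 'rating']])
print(encoded.shape)  # rows x (2 one-hot columns + 1 ordinal column)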
Feature Selection
from sklearn.feature_selection import (
SelectKBest, f_classif, mutual_info_classif,
RFE, SelectFromModel
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
# Load example data
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X, y = data.data, data.target
# Univariate feature selection
selector_kbest = SelectKBest(score_func=f_classif, k=10)
X_kbest = selector_kbest.fit_transform(X, y)
# Get selected feature indices
selected_features = selector_kbest.get_support(indices=True)
print(f"Selected features: {selected_features}")
# Recursive Feature Elimination (RFE)
estimator = RandomForestClassifier(random_state=42)
rfe_selector = RFE(estimator=estimator, n_features_to_select=10)
X_rfe = rfe_selector.fit_transform(X, y)
# Feature selection based on model coefficients (the L1 penalty shrinks weak features to zero;
# for a classification target, an L1-penalized LogisticRegression is a common alternative to Lasso)
lasso = Lasso(alpha=0.01, random_state=42)
sfm_selector = SelectFromModel(lasso)
X_sfm = sfm_selector.fit_transform(X, y)
print(f"Original features: {X.shape[1]}")
print(f"K-best features: {X_kbest.shape[1]}")
print(f"RFE features: {X_rfe.shape[1]}")
print(f"SelectFromModel features: {X_sfm.shape[1]}")
Supervised Learning {#supervised}
Classification Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Load dataset
from sklearn.datasets import load_wine
wine = load_wine()
X, y = wine.data, wine.target
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Dictionary of classifiers
classifiers = {
'Logistic Regression': LogisticRegression(random_state=42),
'Decision Tree': DecisionTreeClassifier(random_state=42),
'Random Forest': RandomForestClassifier(random_state=42),
'Gradient Boosting': GradientBoostingClassifier(random_state=42),
'SVM': SVC(random_state=42),
'Naive Bayes': GaussianNB(),
'KNN': KNeighborsClassifier()
}
# Train and evaluate each classifier
results = {}
for name, clf in classifiers.items():
# Use scaled data for algorithms that need it
if name in ['Logistic Regression', 'SVM', 'KNN']:
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
else:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
results[name] = accuracy
print(f"{name} Accuracy: {accuracy:.3f}")
print(f"Classification Report for {name}:")
print(classification_report(y_test, y_pred))
print("-" * 50)
# Find best classifier
best_classifier = max(results, key=results.get)
print(f"Best classifier: {best_classifier} with accuracy: {results[best_classifier]:.3f}")
Regression Algorithms
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Load dataset
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()  # load_boston was removed from scikit-learn (1.2)
X, y = housing.data, housing.target
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Dictionary of regressors
regressors = {
'Linear Regression': LinearRegression(),
'Ridge Regression': Ridge(alpha=1.0),
'Lasso Regression': Lasso(alpha=1.0),
'Elastic Net': ElasticNet(alpha=1.0, l1_ratio=0.5),
'Decision Tree': DecisionTreeRegressor(random_state=42),
'Random Forest': RandomForestRegressor(random_state=42),
'Gradient Boosting': GradientBoostingRegressor(random_state=42),
'SVR': SVR(),
'KNN': KNeighborsRegressor()
}
# Train and evaluate each regressor
results = {}
for name, reg in regressors.items():
# Use scaled data for algorithms that need it
if name in ['Linear Regression', 'Ridge Regression', 'Lasso Regression',
'Elastic Net', 'SVR', 'KNN']:
reg.fit(X_train_scaled, y_train)
y_pred = reg.predict(X_test_scaled)
else:
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
results[name] = {
'MSE': mse,
'RMSE': rmse,
'MAE': mae,
'R²': r2
}
print(f"{name}:")
print(f" MSE: {mse:.3f}")
print(f" RMSE: {rmse:.3f}")
print(f" MAE: {mae:.3f}")
print(f" R²: {r2:.3f}")
print("-" * 30)
Unsupervised Learning {#unsupervised}
Clustering Algorithms
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, adjusted_rand_score
import matplotlib.pyplot as plt
# Generate sample data
from sklearn.datasets import make_blobs
X, y_true = make_blobs(n_samples=300, centers=4, n_features=2,
random_state=42, cluster_std=0.60)
# K-Means Clustering
kmeans = KMeans(n_clusters=4, random_state=42)
y_kmeans = kmeans.fit_predict(X)
# DBSCAN Clustering (eps and min_samples depend on the data's scale and usually need tuning;
# with these settings many points may be labelled as noise, i.e. -1)
dbscan = DBSCAN(eps=0.3, min_samples=10)
y_dbscan = dbscan.fit_predict(X)
# Hierarchical Clustering
hierarchical = AgglomerativeClustering(n_clusters=4)
y_hierarchical = hierarchical.fit_predict(X)
# Gaussian Mixture Model
gmm = GaussianMixture(n_components=4, random_state=42)
y_gmm = gmm.fit_predict(X)
# Evaluate clustering results
clustering_results = {
'K-Means': y_kmeans,
'DBSCAN': y_dbscan,
'Hierarchical': y_hierarchical,
'GMM': y_gmm
}
for name, labels in clustering_results.items():
if len(set(labels)) > 1: # Check if clustering found more than one cluster
silhouette = silhouette_score(X, labels)
ari = adjusted_rand_score(y_true, labels)
print(f"{name}:")
print(f" Silhouette Score: {silhouette:.3f}")
print(f" Adjusted Rand Index: {ari:.3f}")
print(f" Number of clusters: {len(set(labels))}")
else:
print(f"{name}: Failed to find multiple clusters")
print("-" * 30)
# Plotting results
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Clustering Results')
for i, (name, labels) in enumerate(clustering_results.items()):
row = i // 2
col = i % 2
scatter = axes[row, col].scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
axes[row, col].set_title(name)
axes[row, col].set_xlabel('Feature 1')
axes[row, col].set_ylabel('Feature 2')
plt.tight_layout()
plt.show()
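The example above sets n_clusters=4 because the blobs were generated with four centers; with real data the number of clusters is usually unknown. A common heuristic is to sweep over candidate values of k and inspect the inertia (elbow method) alongside the silhouette score, as in this sketch:
# Sketch: sweep over candidate cluster counts and inspect inertia and silhouette score
for k in range(2, 8):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X)
    print(f"k={k}: inertia={km.inertia_:.1f}, "
          f"silhouette={silhouette_score(X, labels):.3f}")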
Dimensionality Reduction
from sklearn.decomposition import PCA, TruncatedSVD, FastICA
from sklearn.manifold import TSNE, Isomap
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pyplot as plt
# Load high-dimensional dataset
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data, digits.target
print(f"Original shape: {X.shape}")
# Principal Component Analysis (PCA)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print(f"PCA explained variance ratio: {pca.explained_variance_ratio_}")
# Linear Discriminant Analysis (LDA)
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, y)
# t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X)
# Isomap
isomap = Isomap(n_components=2)
X_isomap = isomap.fit_transform(X)
# Independent Component Analysis (ICA)
ica = FastICA(n_components=2, random_state=42)
X_ica = ica.fit_transform(X)
# Plot results
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Dimensionality Reduction Techniques')
techniques = [
('PCA', X_pca),
('LDA', X_lda),
('t-SNE', X_tsne),
('Isomap', X_isomap),
('ICA', X_ica)
]
for i, (name, X_reduced) in enumerate(techniques):
    row = i // 3
    col = i % 3
    scatter = axes[row, col].scatter(X_reduced[:, 0], X_reduced[:, 1],
                                     c=y, cmap='tab10', alpha=0.7)
    axes[row, col].set_title(name)
    axes[row, col].set_xlabel('Component 1')
    axes[row, col].set_ylabel('Component 2')
# Remove empty subplot
fig.delaxes(axes[1, 2])
plt.tight_layout()
plt.show()
# PCA with explained variance analysis
pca_full = PCA()
pca_full.fit(X)
# Plot cumulative explained variance
plt.figure(figsize=(10, 6))
cumsum = np.cumsum(pca_full.explained_variance_ratio_)
plt.plot(range(1, len(cumsum) + 1), cumsum, 'bo-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('PCA Explained Variance')
plt.grid(True)
# Find number of components for 95% variance
n_components_95 = np.argmax(cumsum >= 0.95) + 1
plt.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')
plt.axvline(x=n_components_95, color='r', linestyle='--',
label=f'{n_components_95} Components')
plt.legend()
plt.show()
print(f"Number of components for 95% variance: {n_components_95}")
Model Evaluation {#evaluation}
Cross-Validation
from sklearn.model_selection import (
cross_val_score, cross_validate, StratifiedKFold,
TimeSeriesSplit, LeaveOneOut, ShuffleSplit
)
from sklearn.metrics import make_scorer, f1_score
# Load dataset
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X, y = data.data, data.target
# Simple cross-validation
model = RandomForestClassifier(random_state=42)
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
# Stratified K-Fold (maintains class distribution)
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stratified_scores = cross_val_score(model, X, y, cv=stratified_kfold, scoring='accuracy')
print(f"Stratified CV scores: {stratified_scores}")
print(f"Mean Stratified CV score: {stratified_scores.mean():.3f}")
# Multiple scoring metrics
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)
for metric in scoring:
scores = cv_results[f'test_{metric}']
print(f"{metric}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
# Custom scoring function
def custom_f1_score(y_true, y_pred):
return f1_score(y_true, y_pred, average='weighted')
custom_scorer = make_scorer(custom_f1_score)
custom_scores = cross_val_score(model, X, y, cv=5, scoring=custom_scorer)
print(f"Custom F1 scores: {custom_scores.mean():.3f}")
Performance Metrics
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, roc_curve, precision_recall_curve,
confusion_matrix, classification_report
)
import matplotlib.pyplot as plt
# Train a model
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
# Basic metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")
print(f"AUC-ROC: {auc:.3f}")
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.3f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True)
plt.show()
# Precision-Recall Curve
precision_vals, recall_vals, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(recall_vals, precision_vals, label='Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True)
plt.show()
# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))
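predict uses a 0.5 probability threshold by default; the precision-recall curve above is exactly the information needed to pick a different operating point. A sketch of classifying at a custom threshold (0.8 here is an arbitrary illustrative value):
# Sketch: classify with a custom probability threshold instead of the default 0.5
threshold = 0.8  # illustrative value; choose it from the precision-recall trade-off
y_pred_custom = (y_pred_proba >= threshold).astype(int)
print(f"Precision at threshold {threshold}: {precision_score(y_test, y_pred_custom):.3f}")
print(f"Recall at threshold {threshold}: {recall_score(y_test, y_pred_custom):.3f}")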
This comprehensive guide covers the fundamental aspects of machine learning with scikit-learn. The library's consistent API and extensive documentation make it an excellent choice for both beginners and experienced practitioners. Remember to always validate your models properly, understand your data, and choose appropriate algorithms for your specific problem domain.