import numpy as np
from sklearn.model_selection import train_test_split
# Hold out 25% of the data as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# 5-fold cross-validation scored with negated MSE; negate and take the square root for RMSE
model = LinearRegression()
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-cv_scores)
print(f"Mean RMSE: {rmse_scores.mean()}, Std: {rmse_scores.std()}")
Study Tips:
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# Example with a single feature
X = np.array([[1], [2], [3], [4], [5]])
y = np.array([2, 3.5, 4.8, 6.3, 7.2])
# Create and fit the model
model = LinearRegression()
model.fit(X, y)
# Print model parameters
print(f"Intercept (β₀): {model.intercept_}")
print(f"Coefficient (β₁): {model.coef_[0]}")
# Make predictions
y_pred = model.predict(X)
# Visualize
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X, y_pred, color='red', label='Linear fit')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
from sklearn.linear_model import LinearRegression
import numpy as np
# Multiple features
X = np.array([
    [1, 2, 3],   # 3 features per sample
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12]
])
y = np.array([10, 20, 30, 40])
# Create and fit model
multi_model = LinearRegression()
multi_model.fit(X, y)
# Print model parameters
print(f"Intercept: {multi_model.intercept_}")
print(f"Coefficients: {multi_model.coef_}")
# Make a prediction
new_data = np.array([[5, 6, 7]])
prediction = multi_model.predict(new_data)
print(f"Prediction for {new_data}: {prediction}")
# model.score(X, y) returns R² directly; for more metrics:
from sklearn.metrics import mean_squared_error, r2_score
# Calculate metrics
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}")
print(f"R²: {r2}")
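If your scikit-learn is version 1.4 or newer (an assumption about your environment), the manual square root can be replaced with a dedicated helper:
from sklearn.metrics import root_mean_squared_error
rmse = root_mean_squared_error(y_test, y_pred)  # equivalent to np.sqrt(mean_squared_error(...))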
Study Tips:
Use PolynomialFeatures to generate polynomial and interaction terms:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
# Generate non-linear data
X = np.sort(5 * np.random.rand(80, 1), axis=0)
y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])
# Create pipeline with polynomial features
polynomial_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('linear', LinearRegression())
])
# Fit the model
polynomial_pipeline.fit(X, y)
# Make predictions on a fine grid for plotting
X_test = np.linspace(0, 5, 100)[:, np.newaxis]
y_pred = polynomial_pipeline.predict(X_test)
# Plot results
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_test, y_pred, color='red', label='Polynomial fit (degree=3)')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
Use FunctionTransformer for custom transformations:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import numpy as np
# Create log transformation
log_transformer = FunctionTransformer(np.log1p, validate=True) # log(1+x) to handle zeros
# Create exponential data
X = np.array([[1], [2], [3], [4], [5]])
y = 2 * np.exp(0.5 * X.ravel()) + np.random.normal(0, 0.2, X.shape[0])
# Create and fit pipeline with log transform
log_pipeline = Pipeline([
    ('log', log_transformer),
    ('regression', LinearRegression())
])
log_pipeline.fit(X, y)
# Print results
print(f"Intercept: {log_pipeline.named_steps['regression'].intercept_}")
print(f"Coefficient: {log_pipeline.named_steps['regression'].coef_}")
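Since this example's y grows exponentially in x, transforming the target rather than the features may fit better. A minimal sketch using TransformedTargetRegressor; the choice of np.log/np.exp here is an assumption about the data, not part of the original example:
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
# Fit log(y) = β₀ + β₁x, then invert with exp at predict time
target_log_model = TransformedTargetRegressor(
    regressor=LinearRegression(),
    func=np.log,         # applied to y before fitting
    inverse_func=np.exp  # applied to predictions
)
target_log_model.fit(X, y)
print(f"Predictions on original scale: {target_log_model.predict(X)}")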
Study Tips:
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
X = np.sort(5 * np.random.rand(40, 1), axis=0)
y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])
# Create and fit KNN regressor
k = 3
knn_model = KNeighborsRegressor(n_neighbors=k)
knn_model.fit(X, y)
# Make predictions on a grid
X_test = np.linspace(0, 5, 100)[:, np.newaxis]
y_pred = knn_model.predict(X_test)
# Plot results
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_test, y_pred, color='red', label=f'KNN (k={k})')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
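Besides k, the weights parameter is worth testing: 'distance' lets closer neighbors count more than distant ones. A short sketch comparing both options on the same data:
# Compare uniform vs. distance weighting on the same prediction grid
for weights in ['uniform', 'distance']:
    knn_w = KNeighborsRegressor(n_neighbors=k, weights=weights)
    knn_w.fit(X, y)
    plt.plot(X_test, knn_w.predict(X_test), label=f'k={k}, weights={weights}')
plt.scatter(X, y, color='blue', label='Data')
plt.legend()
plt.show()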
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
# Setup parameter grid
param_grid = {'n_neighbors': np.arange(1, 20)}
# Setup grid search
knn = KNeighborsRegressor()
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)
# Print best parameters
print(f"Best k value: {grid_search.best_params_['n_neighbors']}")
print(f"Best RMSE: {np.sqrt(-grid_search.best_score_)}")
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
import numpy as np
# Load a dataset with multiple features
# (load_boston was removed in scikit-learn 1.2; California housing is the usual replacement)
data = fetch_california_housing()
X, y = data.data, data.target
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Create pipeline with scaling
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5))
])
# Create pipeline without scaling
knn_no_scaling = Pipeline([
    ('knn', KNeighborsRegressor(n_neighbors=5))
])
# Fit and evaluate both models
knn_pipeline.fit(X_train, y_train)
knn_no_scaling.fit(X_train, y_train)
# Calculate RMSE
rmse_with_scaling = np.sqrt(mean_squared_error(y_test, knn_pipeline.predict(X_test)))
rmse_no_scaling = np.sqrt(mean_squared_error(y_test, knn_no_scaling.predict(X_test)))
print(f"RMSE with scaling: {rmse_with_scaling}")
print(f"RMSE without scaling: {rmse_no_scaling}")
Study Tips:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
X = np.sort(5 * np.random.rand(80, 1), axis=0)
y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])
# Create and fit Decision Tree regressor
dt_model = DecisionTreeRegressor(max_depth=3, random_state=42)
dt_model.fit(X, y)
# Make predictions on a grid
X_test = np.linspace(0, 5, 100)[:, np.newaxis]
y_pred = dt_model.predict(X_test)
# Plot results
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_test, y_pred, color='red', label='Decision Tree (max_depth=3)')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
# Export tree visualization (optional)
export_graphviz(dt_model, out_file='tree.dot',
                feature_names=['x'],
                filled=True,
                rounded=True)
# Convert to PNG with: dot -Tpng tree.dot -o tree.png
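If graphviz is not available, matplotlib's plot_tree draws the same structure without the external dot tool:
from sklearn.tree import plot_tree
plt.figure(figsize=(12, 6))
plot_tree(dt_model, feature_names=['x'], filled=True, rounded=True)
plt.show()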
Key pre-pruning hyperparameters:
max_depth: maximum depth of the tree
min_samples_split: minimum samples required to split a node
min_samples_leaf: minimum samples required in a leaf node
max_features: maximum number of features to consider when looking for the best split
Post-pruning is controlled by the ccp_alpha parameter (see the sketch after the grid search below).
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
# Setup parameter grid for pre-pruning
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, None],
    'min_samples_leaf': [1, 2, 4, 8],
    'min_samples_split': [2, 4, 6, 8]
}
# Setup grid search
dt = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)
# Print best parameters
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best RMSE: {np.sqrt(-grid_search.best_score_)}")
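For post-pruning with ccp_alpha, cost_complexity_pruning_path enumerates the candidate alphas, which can then be searched like any other hyperparameter; a minimal sketch:
# Enumerate the effective alphas for cost-complexity (post-)pruning
path = DecisionTreeRegressor(random_state=42).cost_complexity_pruning_path(X, y)
alphas = [a for a in path.ccp_alphas if a >= 0]  # guard against tiny negative float error
pruning_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    {'ccp_alpha': alphas},
    cv=5, scoring='neg_mean_squared_error'
)
pruning_search.fit(X, y)
print(f"Best ccp_alpha: {pruning_search.best_params_['ccp_alpha']}")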
Study Tips:
from sklearn.svm import SVR
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
X = np.sort(5 * np.random.rand(80, 1), axis=0)
y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])
# Create and fit SVR with linear kernel
svr_linear = SVR(kernel='linear', C=1.0, epsilon=0.1)
svr_linear.fit(X, y)
# Make predictions on a grid
X_test = np.linspace(0, 5, 100)[:, np.newaxis]
y_pred = svr_linear.predict(X_test)
# Plot results
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_test, y_pred, color='red', label='SVR (linear kernel)')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
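Only points on or outside the ε-tube become support vectors, so widening epsilon produces a sparser model. A quick check (the epsilon values here are illustrative):
# Larger epsilon -> wider insensitive tube -> fewer support vectors
for eps in [0.01, 0.1, 0.5]:
    svr = SVR(kernel='linear', C=1.0, epsilon=eps).fit(X, y)
    print(f"epsilon={eps}: {len(svr.support_)} support vectors")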
from sklearn.svm import SVR
import numpy as np
import matplotlib.pyplot as plt
# Generate nonlinear data
X = np.sort(5 * np.random.rand(80, 1), axis=0)
y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])
# Create and fit SVRs with different kernels
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
svr_poly = SVR(kernel='poly', C=100, degree=3, epsilon=0.1)
svr_linear = SVR(kernel='linear', C=100, epsilon=0.1)
svr_rbf.fit(X, y)
svr_poly.fit(X, y)
svr_linear.fit(X, y)
# Make predictions
X_test = np.linspace(0, 5, 100)[:, np.newaxis]
y_rbf = svr_rbf.predict(X_test)
y_poly = svr_poly.predict(X_test)
y_linear = svr_linear.predict(X_test)
# Plot results
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_test, y_rbf, color='red', label='RBF kernel')
plt.plot(X_test, y_poly, color='green', label='Polynomial kernel')
plt.plot(X_test, y_linear, color='purple', label='Linear kernel')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Create pipeline with scaling (important for SVR)
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])
# Setup parameter grid
param_grid = {
    'svr__kernel': ['rbf', 'linear'],
    'svr__C': [0.1, 1, 10, 100],
    'svr__epsilon': [0.01, 0.1, 0.2],
    'svr__gamma': ['scale', 'auto', 0.1, 1]
}
# Setup grid search
grid_search = GridSearchCV(svr_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)
# Print best parameters
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best RMSE: {np.sqrt(-grid_search.best_score_)}")
Study Tips:
from sklearn.linear_model import Ridge
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
X = np.random.randn(100, 10)
true_coef = np.array([3, 1.5, 0, 0, 2, 0, 0, 0, 0, 0])
y = X @ true_coef + np.random.randn(100) * 0.5
# Create and fit Ridge models with different alphas
alphas = [0, 0.1, 1.0, 10.0]
coefs = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)
# Plot coefficients for different alphas
plt.figure(figsize=(10, 6))
for i, alpha in enumerate(alphas):
    plt.plot(range(10), coefs[i], 'o-', label=f'alpha = {alpha}')
plt.legend()
plt.xlabel('Coefficient index')
plt.ylabel('Coefficient value')
plt.title('Ridge coefficients as alpha varies')
plt.axhline(y=0, color='k', linestyle='--')
plt.show()
from sklearn.linear_model import Lasso
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data with sparse coefficients
X = np.random.randn(100, 10)
true_coef = np.array([3, 1.5, 0, 0, 2, 0, 0, 0, 0, 0])
y = X @ true_coef + np.random.randn(100) * 0.5
# Create and fit Lasso models with different alphas
alphas = [0.001, 0.01, 0.1, 1.0]
coefs = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X, y)
    coefs.append(lasso.coef_)
# Plot coefficients for different alphas
plt.figure(figsize=(10, 6))
for i, alpha in enumerate(alphas):
    plt.plot(range(10), coefs[i], 'o-', label=f'alpha = {alpha}')
plt.legend()
plt.xlabel('Coefficient index')
plt.ylabel('Coefficient value')
plt.title('Lasso coefficients as alpha varies')
plt.axhline(y=0, color='k', linestyle='--')
plt.show()
ElasticNet combines both penalties:
alpha: total regularization strength
l1_ratio: proportion of the L1 penalty (0 = Ridge, 1 = Lasso)
from sklearn.linear_model import ElasticNet
import numpy as np
# Generate sample data
X = np.random.randn(100, 10)
true_coef = np.array([3, 1.5, 0, 0, 2, 0, 0, 0, 0, 0])
y = X @ true_coef + np.random.randn(100) * 0.5
# Create and fit ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=10000)
elastic_net.fit(X, y)
# Print coefficients
print(f"ElasticNet coefficients: {elastic_net.coef_}")
# Compare with Lasso and Ridge
from sklearn.linear_model import Lasso, Ridge
lasso = Lasso(alpha=0.1, max_iter=10000)
ridge = Ridge(alpha=0.1)
lasso.fit(X, y)
ridge.fit(X, y)
print(f"Lasso coefficients: {lasso.coef_}")
print(f"Ridge coefficients: {ridge.coef_}")
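scikit-learn also ships estimators with built-in cross-validation for choosing alpha; a brief sketch using LassoCV and RidgeCV (the alpha grids are illustrative):
from sklearn.linear_model import LassoCV, RidgeCV
lasso_cv = LassoCV(alphas=np.logspace(-3, 1, 20), cv=5, max_iter=10000).fit(X, y)
ridge_cv = RidgeCV(alphas=np.logspace(-3, 3, 20), cv=5).fit(X, y)
print(f"LassoCV chose alpha = {lasso_cv.alpha_}")
print(f"RidgeCV chose alpha = {ridge_cv.alpha_}")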
from sklearn.linear_model import Ridge
from sklearn.model_selection import validation_curve
import numpy as np
import matplotlib.pyplot as plt
# Generate data
X = np.random.randn(100, 5)
true_coef = np.array([3, 1.5, 0, 2, 0.5])
y = X @ true_coef + np.random.randn(100) * 0.5
# Calculate validation curve
param_range = np.logspace(-3, 3, 10)
train_scores, test_scores = validation_curve(
    Ridge(), X, y, param_name="alpha", param_range=param_range,
    cv=5, scoring="neg_mean_squared_error"
)
# Convert to RMSE
train_rmse = np.sqrt(-train_scores).mean(axis=1)
test_rmse = np.sqrt(-test_scores).mean(axis=1)
# Plot validation curve
plt.figure(figsize=(10, 6))
plt.semilogx(param_range, train_rmse, label="Training RMSE")
plt.semilogx(param_range, test_rmse, label="Validation RMSE")
plt.xlabel("alpha")
plt.ylabel("Root Mean Squared Error")
plt.legend()
plt.title("Validation Curve for Ridge Regression")
plt.grid()
plt.show()
# Grid search example
from sklearn.model_selection import GridSearchCV
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}
grid_search = GridSearchCV(
    Ridge(), param_grid, cv=5, scoring='neg_mean_squared_error'
)
grid_search.fit(X, y)
print(f"Best alpha: {grid_search.best_params_['alpha']}")
print(f"Best RMSE: {np.sqrt(-grid_search.best_score_)}")
Study Tips:
Use OneHotEncoder in scikit-learn:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
# Sample categorical data
data = np.array([['Male', 'Small'], ['Female', 'Medium'], ['Female', 'Large'], ['Male', 'Medium']])
df = pd.DataFrame(data, columns=['Gender', 'Size'])
# One-hot encoding
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(df)
# Create DataFrame with feature names
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['Gender', 'Size'])
)
print("Original data:")
print(df)
print("\nOne-hot encoded data:")
print(encoded_df)
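Real tables usually mix categorical and numeric columns; ColumnTransformer routes each subset to its own preprocessor. A minimal sketch (the Age column is invented here for illustration):
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# Hypothetical numeric column added to the categorical frame above
df['Age'] = [25, 32, 47, 51]
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(sparse_output=False), ['Gender', 'Size']),
    ('num', StandardScaler(), ['Age'])
])
print(preprocessor.fit_transform(df))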
Use KBinsDiscretizer in scikit-learn:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
import matplotlib.pyplot as plt
# Generate continuous data
X = np.random.randn(100, 1) * 3 + 5 # Mean = 5, Std = 3
# Create different binning strategies
n_bins = 5
discretizers = [
    ('uniform', KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')),
    ('quantile', KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')),
    ('kmeans', KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='kmeans'))
]
# Apply discretization
plt.figure(figsize=(15, 10))
for i, (strategy, discretizer) in enumerate(discretizers):
    X_binned = discretizer.fit_transform(X)
    # Plot original data vs binned data
    plt.subplot(3, 1, i+1)
    plt.scatter(X, X_binned)
    plt.xlabel('Original Value')
    plt.ylabel('Bin')
    plt.title(f'Binning with {strategy} strategy')
    # Add bin edges for uniform and quantile
    if strategy in ['uniform', 'quantile']:
        for edge in discretizer.bin_edges_[0]:
            plt.axvline(edge, color='r', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.show()
Use PolynomialFeatures with degree=2, interaction_only=True to generate interaction terms:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import pandas as pd
# Sample data with two features
X = np.array([[1, 2], [3, 4], [5, 6]])
df = pd.DataFrame(X, columns=['Feature1', 'Feature2'])
# Create interaction terms
interaction = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
X_interaction = interaction.fit_transform(X)
# Create DataFrame with feature names
interaction_df = pd.DataFrame(
    X_interaction,
    columns=interaction.get_feature_names_out(['Feature1', 'Feature2'])
)
print("Original data:")
print(df)
print("\nWith interaction terms:")
print(interaction_df)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import numpy as np
import matplotlib.pyplot as plt
# Generate data with outliers
X = np.random.randn(100, 1)
X[0] = 10 # Add an outlier
# Apply different scaling methods
scalers = [
    ('Standard', StandardScaler()),
    ('MinMax', MinMaxScaler()),
    ('Robust', RobustScaler())
]
plt.figure(figsize=(15, 10))
# Plot original data
plt.subplot(4, 1, 1)
plt.hist(X, bins=30)
plt.title('Original Data')
# Plot scaled data for each scaler
for i, (name, scaler) in enumerate(scalers):
    X_scaled = scaler.fit_transform(X)
    plt.subplot(4, 1, i+2)
    plt.hist(X_scaled, bins=30)
    plt.title(f'{name} Scaled Data')
plt.tight_layout()
plt.show()
# Compare specific values
sample = np.array([[0], [1], [10]]) # mean, 1 std dev, outlier
print("Original values:", sample.ravel())
for name, scaler in scalers:
    scaler.fit(X)
    scaled = scaler.transform(sample)
    print(f"{name} scaled:", scaled.ravel())
Study Tips:
The scikit-learn estimator API:
fit(X, y): train the model on data
predict(X): make predictions for new data
score(X, y): calculate model performance
transform(X): apply a data transformation
Transformers implement the fit and transform methods; predictors implement the fit and predict methods. Transformers also provide fit_transform for efficiency.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
# Generate sample data
X = np.random.rand(100, 2)
y = 3*X[:, 0] + 2*X[:, 1] + np.random.randn(100) * 0.1
# Example of transformer API
scaler = StandardScaler()
scaler.fit(X) # Learn parameters (mean, std)
X_scaled = scaler.transform(X) # Apply transformation
# Example of predictor API
model = LinearRegression()
model.fit(X_scaled, y) # Train model
y_pred = model.predict(X_scaled) # Make predictions
r2 = model.score(X_scaled, y) # Calculate R²
print(f"Model coefficients: {model.coef_}")
print(f"Model intercept: {model.intercept_}")
print(f"R² score: {r2}")
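The fit-then-transform pair above can be collapsed into a single call, which is what fit_transform is for:
# Equivalent to scaler.fit(X) followed by scaler.transform(X)
X_scaled = StandardScaler().fit_transform(X)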
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
import numpy as np
from sklearn.model_selection import train_test_split
# Generate non-linear data
X = np.random.rand(100, 1)
y = np.sin(2 * np.pi * X.ravel()) + np.random.randn(100) * 0.1
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a pipeline with preprocessing and model
pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=0.1))
])
# Train and evaluate in one step
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print(f"Pipeline R² score: {score}")
# Access individual steps
print(f"Polynomial features shape: {pipeline.named_steps['poly'].n_output_features_}")
print(f"Ridge coefficients: {pipeline.named_steps['ridge'].coef_}")
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
import numpy as np
# Generate non-linear data
X = np.random.rand(100, 1)
y = np.sin(2 * np.pi * X.ravel()) + np.random.randn(100) * 0.1
# Create pipeline
pipeline = Pipeline([
    ('poly', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])
# Parameter grid
param_grid = {
    'poly__degree': [1, 2, 3, 4],
    'ridge__alpha': [0.001, 0.01, 0.1, 1.0, 10.0]
}
# Grid search
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='neg_mean_squared_error'
)
grid_search.fit(X, y)
# Print results
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best RMSE: {np.sqrt(-grid_search.best_score_)}")
# Make predictions with best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X)
# Plot results
import matplotlib.pyplot as plt
X_sorted = np.sort(X, axis=0)
y_pred_sorted = best_model.predict(X_sorted)
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_sorted, y_pred_sorted, color='red', label='Best model fit')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.title('Best Pipeline Model')
plt.show()
Study Tips:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load data from CSV (example)
# df = pd.read_csv('your_data.csv')
# Create sample data
np.random.seed(42)
n_samples = 1000
n_features = 5
X = np.random.randn(n_samples, n_features)
y = 2*X[:, 0] + 3*X[:, 1] - X[:, 2] + 0.5*X[:, 3] + np.random.randn(n_samples) * 0.5
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
from sklearn.model_selection import cross_val_score, KFold, validation_curve
from sklearn.linear_model import Ridge
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
X = np.random.randn(100, 5)
y = 2*X[:, 0] + 3*X[:, 1] - X[:, 2] + np.random.randn(100) * 0.5
# Basic cross-validation
model = Ridge(alpha=1.0)
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-cv_scores)
print(f"Cross-validation RMSE: {rmse_scores.mean()} ± {rmse_scores.std()}")
# Custom KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_custom = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')
rmse_custom = np.sqrt(-cv_custom)
print(f"Custom KFold RMSE: {rmse_custom.mean()} ± {rmse_custom.std()}")
# Validation curve
param_range = np.logspace(-3, 3, 10)
train_scores, test_scores = validation_curve(
    Ridge(), X, y, param_name="alpha", param_range=param_range,
    cv=5, scoring="neg_mean_squared_error"
)
# Plot validation curve
plt.figure(figsize=(10, 6))
plt.semilogx(param_range, np.sqrt(-train_scores).mean(axis=1),
             label="Training RMSE")
plt.semilogx(param_range, np.sqrt(-test_scores).mean(axis=1),
             label="Validation RMSE")
plt.xlabel("alpha")
plt.ylabel("RMSE")
plt.legend()
plt.title("Validation Curve for Ridge Regression")
plt.grid()
plt.show()
from sklearn.model_selection import learning_curve
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
X = np.random.randn(200, 5)
y = 2*X[:, 0] + 3*X[:, 1] - X[:, 2] + np.random.randn(200) * 0.5
# Learning curve
train_sizes, train_scores, test_scores = learning_curve(
    LinearRegression(), X, y, cv=5, scoring='neg_mean_squared_error',
    train_sizes=np.linspace(0.1, 1.0, 10)
)
# Calculate RMSE
train_rmse = np.sqrt(-train_scores).mean(axis=1)
test_rmse = np.sqrt(-test_scores).mean(axis=1)
# Plot learning curve
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_rmse, 'o-', color='r', label='Training RMSE')
plt.plot(train_sizes, test_rmse, 'o-', color='g', label='Validation RMSE')
plt.xlabel('Training set size')
plt.ylabel('RMSE')
plt.title('Learning Curve for Linear Regression')
plt.legend()
plt.grid()
plt.show()
# Fit model for residual plot
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
residuals = y - y_pred
# Residual plot
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals)
plt.axhline(y=0, color='r', linestyle='-')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid()
plt.show()
# Feature importance plot
plt.figure(figsize=(10, 6))
plt.bar(range(X.shape[1]), model.coef_)
plt.xlabel('Feature index')
plt.ylabel('Coefficient value')
plt.title('Feature Importance')
plt.xticks(range(X.shape[1]))
plt.grid(axis='y')
plt.show()
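Raw coefficients reflect importance only when features share a scale; permutation importance is a scale-free alternative. A minimal sketch:
from sklearn.inspection import permutation_importance
result = permutation_importance(model, X, y, n_repeats=10, random_state=42)
for i, imp in enumerate(result.importances_mean):
    print(f"Feature {i}: {imp:.3f}")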
Study Tips:
Q: What are the key differences between Ridge, Lasso, and Elastic Net regularization?
A: Ridge adds an L2 penalty (alpha times the sum of squared coefficients), shrinking coefficients toward zero but rarely to exactly zero. Lasso adds an L1 penalty (alpha times the sum of absolute coefficients), which can zero out coefficients entirely and so performs feature selection. Elastic Net mixes both: alpha sets the overall strength and l1_ratio the L1/L2 balance (0 = Ridge, 1 = Lasso), which helps with groups of correlated features.
Q: Explain the ε-insensitive loss function in SVR and how the key parameters affect the model.
A: SVR ignores errors smaller than epsilon: points inside the ε-tube around the prediction contribute no loss, and only points on or outside the tube become support vectors. A larger epsilon gives a flatter, sparser model; C trades off tube violations against flatness (larger C fits the training data more closely); for RBF and polynomial kernels, gamma controls how far each support vector's influence reaches.
Q: What is k-fold cross-validation and why is it important?
A: The data is split into k folds; the model is trained k times, each time holding out a different fold for validation, and the k scores are averaged. Every sample is used for both training and validation, the performance estimate is more stable than a single train/test split, and the spread of the fold scores indicates how variable that estimate is.
Q: Compare and contrast Decision Tree Regression and k-Nearest Neighbors Regression.
A: Both are non-parametric and capture non-linear patterns. Decision trees learn axis-aligned splits, produce piecewise-constant predictions, ignore feature scales, and are easy to interpret, but overfit unless pruned. kNN stores the training data and predicts by averaging the k nearest neighbors; it needs feature scaling, gets slow at prediction time on large datasets, and degrades in high dimensions, but has essentially no training phase.
Q: What is data leakage and how do scikit-learn Pipelines help prevent it?
A: Data leakage occurs when information from validation or test data influences model fitting, for example fitting a scaler on the full dataset before splitting. A Pipeline bundles preprocessing with the model, so under cross-validation every step is refit on the training portion of each fold only, as the sketch below illustrates.
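A sketch of the leaky pattern versus the pipeline pattern (the synthetic data here is illustrative, not from the original):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
import numpy as np
X = np.random.randn(100, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + np.random.randn(100) * 0.1
# Leaky: the scaler sees the full dataset before cross-validation splits it
X_leaky = StandardScaler().fit_transform(X)
leaky_scores = cross_val_score(Ridge(), X_leaky, y, cv=5)
# Safe: scaling is refit inside each training fold only
pipe = Pipeline([('scaler', StandardScaler()), ('ridge', Ridge())])
safe_scores = cross_val_score(pipe, X, y, cv=5)
print(f"Leaky CV R²: {leaky_scores.mean():.3f}, Safe CV R²: {safe_scores.mean():.3f}")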
Q: How do you interpret learning curves to diagnose model problems?
A: Plot training and validation error against training set size. If both curves converge at a high error, the model underfits (high bias): add features or model capacity. If training error stays low while validation error stays much higher, the model overfits (high variance): add data, regularize, or simplify the model. Curves that are still converging suggest more data would help.
Q: What are the main types of feature engineering techniques and when should you use them?
A: The ones covered here: encoding categorical variables (OneHotEncoder) whenever a model needs numeric input; discretizing continuous features (KBinsDiscretizer) to capture threshold effects; adding polynomial and interaction terms (PolynomialFeatures) when relationships are non-linear or features act jointly; applying non-linear transformations (FunctionTransformer) for skewed or exponential data; and scaling (StandardScaler, MinMaxScaler, RobustScaler) for distance- and margin-based models such as kNN and SVR, with RobustScaler preferred when outliers are present.