Produced by Claude 3.7 Sonnet, © 2025 Anthropic, PBC.
Fixed by Gemini 2.5 Pro, © 2025 Google, LLC.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import numpy as np

model = LinearRegression()
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-cv_scores)
print(f"Mean RMSE: {rmse_scores.mean()}, Std: {rmse_scores.std()}")
Study Tips:
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# Example with a single feature
X = np.array([[1], [2], [3], [4], [5]])
y = np.array([2, 3.5, 4.8, 6.3, 7.2])

# Create and fit the model
model = LinearRegression()
model.fit(X, y)

# Print model parameters
print(f"Intercept (β₀): {model.intercept_}")
print(f"Coefficient (β₁): {model.coef_[0]}")

# Make predictions
y_pred = model.predict(X)

# Visualize
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X, y_pred, color='red', label='Linear fit')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
from sklearn.linear_model import LinearRegression
import numpy as np
# Multiple features
X = np.array([
    [1, 2, 3],    # 3 features per sample
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12]
])
y = np.array([10, 20, 30, 40])

# Create and fit model
multi_model = LinearRegression()
multi_model.fit(X, y)

# Print model parameters
print(f"Intercept: {multi_model.intercept_}")
print(f"Coefficients: {multi_model.coef_}")

# Make a prediction
new_data = np.array([[5, 6, 7]])
prediction = multi_model.predict(new_data)
print(f"Prediction for {new_data}: {prediction}")
model.score(X, y)  # R² on the training data

from sklearn.metrics import mean_squared_error, r2_score

# Calculate metrics on the test set
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R²: {r2}")
Study Tips:
Use PolynomialFeatures to generate polynomial and interaction terms.

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
# Generate non-linear data
X = np.sort(5 * np.random.rand(80, 1), axis=0)
y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])

# Create pipeline with polynomial features
polynomial_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('linear', LinearRegression())
])

# Fit the model
polynomial_pipeline.fit(X, y)

# Make predictions on a fine grid for plotting
X_test = np.linspace(0, 5, 100)[:, np.newaxis]
y_pred = polynomial_pipeline.predict(X_test)

# Plot results
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_test, y_pred, color='red', label='Polynomial fit (degree=3)')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
Use FunctionTransformer for custom transformations.
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import numpy as np
# Create log transformation
log_transformer = FunctionTransformer(np.log1p, validate=True)  # log(1+x) to handle zeros

# Create exponential data
X = np.array([[1], [2], [3], [4], [5]])
y = 2 * np.exp(0.5 * X.ravel()) + np.random.normal(0, 0.2, X.shape[0])

# Create and fit pipeline with log transform
log_pipeline = Pipeline([
    ('log', log_transformer),
    ('regression', LinearRegression())
])
log_pipeline.fit(X, y)

# Print results
print(f"Intercept: {log_pipeline.named_steps['regression'].intercept_}")
print(f"Coefficient: {log_pipeline.named_steps['regression'].coef_}")
Study Tips:
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
X = np.sort(5 * np.random.rand(40, 1), axis=0)
y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])

# Create and fit KNN regressor
k = 3
knn_model = KNeighborsRegressor(n_neighbors=k)
knn_model.fit(X, y)

# Make predictions on a grid
X_test = np.linspace(0, 5, 100)[:, np.newaxis]
y_pred = knn_model.predict(X_test)

# Plot results
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_test, y_pred, color='red', label=f'KNN (k={k})')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
# Setup parameter grid
param_grid = {'n_neighbors': np.arange(1, 20)}

# Setup grid search
knn = KNeighborsRegressor()
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

# Print best parameters
print(f"Best k value: {grid_search.best_params_['n_neighbors']}")
print(f"Best RMSE: {np.sqrt(-grid_search.best_score_)}")
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing  # load_boston was removed from scikit-learn
from sklearn.metrics import mean_squared_error
import numpy as np

# Load a dataset with multiple features
data = fetch_california_housing()
X, y = data.data, data.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create pipeline with scaling
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5))
])

# Create pipeline without scaling
knn_no_scaling = Pipeline([
    ('knn', KNeighborsRegressor(n_neighbors=5))
])

# Fit and evaluate both models
knn_pipeline.fit(X_train, y_train)
knn_no_scaling.fit(X_train, y_train)

# Calculate RMSE
rmse_with_scaling = np.sqrt(mean_squared_error(y_test, knn_pipeline.predict(X_test)))
rmse_no_scaling = np.sqrt(mean_squared_error(y_test, knn_no_scaling.predict(X_test)))

print(f"RMSE with scaling: {rmse_with_scaling}")
print(f"RMSE without scaling: {rmse_no_scaling}")
Study Tips:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
X = np.sort(5 * np.random.rand(80, 1), axis=0)
y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])

# Create and fit Decision Tree regressor
dt_model = DecisionTreeRegressor(max_depth=3, random_state=42)
dt_model.fit(X, y)

# Make predictions on a grid
X_test = np.linspace(0, 5, 100)[:, np.newaxis]
y_pred = dt_model.predict(X_test)

# Plot results
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_test, y_pred, color='red', label='Decision Tree (max_depth=3)')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

# Export tree visualization (optional)
export_graphviz(dt_model, out_file='tree.dot',
                feature_names=['x'],
                filled=True,
                rounded=True)
# Convert to PNG with: dot -Tpng tree.dot -o tree.png
max_depth: Maximum depth of the tree
min_samples_split: Minimum samples required to split a node
min_samples_leaf: Minimum samples required in a leaf node
max_features: Maximum number of features to consider when looking for the best split
Post-pruning is controlled with the ccp_alpha parameter (a minimal sketch follows the grid-search example below).

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
# Setup parameter grid for pre-pruning
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, None],
    'min_samples_leaf': [1, 2, 4, 8],
    'min_samples_split': [2, 4, 6, 8]
}

# Setup grid search
dt = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

# Print best parameters
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best RMSE: {np.sqrt(-grid_search.best_score_)}")
Study Tips:
from sklearn.svm import SVR
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
X = np.sort(5 * np.random.rand(80, 1), axis=0)
y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])

# Create and fit SVR with linear kernel
svr_linear = SVR(kernel='linear', C=1.0, epsilon=0.1)
svr_linear.fit(X, y)

# Make predictions on a grid
X_test = np.linspace(0, 5, 100)[:, np.newaxis]
y_pred = svr_linear.predict(X_test)

# Plot results
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_test, y_pred, color='red', label='SVR (linear kernel)')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
from sklearn.svm import SVR
import numpy as np
import matplotlib.pyplot as plt
# Generate nonlinear data
X = np.sort(5 * np.random.rand(80, 1), axis=0)
y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])

# Create and fit SVRs with different kernels
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
svr_poly = SVR(kernel='poly', C=100, degree=3, epsilon=0.1)
svr_linear = SVR(kernel='linear', C=100, epsilon=0.1)

svr_rbf.fit(X, y)
svr_poly.fit(X, y)
svr_linear.fit(X, y)

# Make predictions
X_test = np.linspace(0, 5, 100)[:, np.newaxis]
y_rbf = svr_rbf.predict(X_test)
y_poly = svr_poly.predict(X_test)
y_linear = svr_linear.predict(X_test)

# Plot results
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_test, y_rbf, color='red', label='RBF kernel')
plt.plot(X_test, y_poly, color='green', label='Polynomial kernel')
plt.plot(X_test, y_linear, color='purple', label='Linear kernel')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Create pipeline with scaling (important for SVR)
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

# Setup parameter grid
param_grid = {
    'svr__kernel': ['rbf', 'linear'],
    'svr__C': [0.1, 1, 10, 100],
    'svr__epsilon': [0.01, 0.1, 0.2],
    'svr__gamma': ['scale', 'auto', 0.1, 1]
}

# Setup grid search
grid_search = GridSearchCV(svr_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

# Print best parameters
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best RMSE: {np.sqrt(-grid_search.best_score_)}")
Study Tips:
from sklearn.linear_model import Ridge
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
X = np.random.randn(100, 10)
true_coef = np.array([3, 1.5, 0, 0, 2, 0, 0, 0, 0, 0])
y = X @ true_coef + np.random.randn(100) * 0.5

# Create and fit Ridge models with different alphas
alphas = [0, 0.1, 1.0, 10.0]
coefs = []

for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)

# Plot coefficients for different alphas
plt.figure(figsize=(10, 6))
for i, alpha in enumerate(alphas):
    plt.plot(range(10), coefs[i], 'o-', label=f'alpha = {alpha}')

plt.legend()
plt.xlabel('Coefficient index')
plt.ylabel('Coefficient value')
plt.title('Ridge coefficients as alpha varies')
plt.axhline(y=0, color='k', linestyle='--')
plt.show()
from sklearn.linear_model import Lasso
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data with sparse coefficients
X = np.random.randn(100, 10)
true_coef = np.array([3, 1.5, 0, 0, 2, 0, 0, 0, 0, 0])
y = X @ true_coef + np.random.randn(100) * 0.5

# Create and fit Lasso models with different alphas
alphas = [0.001, 0.01, 0.1, 1.0]
coefs = []

for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X, y)
    coefs.append(lasso.coef_)

# Plot coefficients for different alphas
plt.figure(figsize=(10, 6))
for i, alpha in enumerate(alphas):
    plt.plot(range(10), coefs[i], 'o-', label=f'alpha = {alpha}')

plt.legend()
plt.xlabel('Coefficient index')
plt.ylabel('Coefficient value')
plt.title('Lasso coefficients as alpha varies')
plt.axhline(y=0, color='k', linestyle='--')
plt.show()
alpha: Total regularization strength
l1_ratio: Proportion of L1 penalty (0 = Ridge, 1 = Lasso)

from sklearn.linear_model import ElasticNet
import numpy as np
# Generate sample data
X = np.random.randn(100, 10)
true_coef = np.array([3, 1.5, 0, 0, 2, 0, 0, 0, 0, 0])
y = X @ true_coef + np.random.randn(100) * 0.5

# Create and fit ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=10000)
elastic_net.fit(X, y)

# Print coefficients
print(f"ElasticNet coefficients: {elastic_net.coef_}")

# Compare with Lasso and Ridge
from sklearn.linear_model import Lasso, Ridge
lasso = Lasso(alpha=0.1, max_iter=10000)
ridge = Ridge(alpha=0.1)

lasso.fit(X, y)
ridge.fit(X, y)

print(f"Lasso coefficients: {lasso.coef_}")
print(f"Ridge coefficients: {ridge.coef_}")
from sklearn.linear_model import Ridge
from sklearn.model_selection import validation_curve
import numpy as np
import matplotlib.pyplot as plt
# Generate data
X = np.random.randn(100, 5)
true_coef = np.array([3, 1.5, 0, 2, 0.5])
y = X @ true_coef + np.random.randn(100) * 0.5

# Calculate validation curve
param_range = np.logspace(-3, 3, 10)
train_scores, test_scores = validation_curve(
    Ridge(), X, y, param_name="alpha", param_range=param_range,
    cv=5, scoring="neg_mean_squared_error"
)

# Convert to RMSE
train_rmse = np.sqrt(-train_scores).mean(axis=1)
test_rmse = np.sqrt(-test_scores).mean(axis=1)

# Plot validation curve
plt.figure(figsize=(10, 6))
plt.semilogx(param_range, train_rmse, label="Training RMSE")
plt.semilogx(param_range, test_rmse, label="Validation RMSE")
plt.xlabel("alpha")
plt.ylabel("Root Mean Squared Error")
plt.legend()
plt.title("Validation Curve for Ridge Regression")
plt.grid()
plt.show()

# Grid search example
from sklearn.model_selection import GridSearchCV
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}
grid_search = GridSearchCV(
    Ridge(), param_grid, cv=5, scoring='neg_mean_squared_error'
)
grid_search.fit(X, y)
print(f"Best alpha: {grid_search.best_params_['alpha']}")
print(f"Best RMSE: {np.sqrt(-grid_search.best_score_)}")
Study Tips:
Use OneHotEncoder in scikit-learn.

from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
# Sample categorical data
data = np.array([['Male', 'Small'], ['Female', 'Medium'], ['Female', 'Large'], ['Male', 'Medium']])
df = pd.DataFrame(data, columns=['Gender', 'Size'])

# One-hot encoding
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(df)

# Create DataFrame with feature names
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['Gender', 'Size'])
)

print("Original data:")
print(df)
print("\nOne-hot encoded data:")
print(encoded_df)
Use KBinsDiscretizer in scikit-learn.

from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
import matplotlib.pyplot as plt
# Generate continuous data
X = np.random.randn(100, 1) * 3 + 5  # Mean = 5, Std = 3

# Create different binning strategies
n_bins = 5
discretizers = [
    ('uniform', KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')),
    ('quantile', KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')),
    ('kmeans', KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='kmeans'))
]

# Apply discretization
plt.figure(figsize=(15, 10))
for i, (strategy, discretizer) in enumerate(discretizers):
    X_binned = discretizer.fit_transform(X)

    # Plot original data vs binned data
    plt.subplot(3, 1, i+1)
    plt.scatter(X, X_binned)
    plt.xlabel('Original Value')
    plt.ylabel('Bin')
    plt.title(f'Binning with {strategy} strategy')

    # Add bin edges for uniform and quantile
    if strategy in ['uniform', 'quantile']:
        for edge in discretizer.bin_edges_[0]:
            plt.axvline(edge, color='r', linestyle='--', alpha=0.3)

plt.tight_layout()
plt.show()
Use PolynomialFeatures with degree=2, interaction_only=True.
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import pandas as pd
# Sample data with two features
X = np.array([[1, 2], [3, 4], [5, 6]])
df = pd.DataFrame(X, columns=['Feature1', 'Feature2'])

# Create interaction terms
interaction = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
X_interaction = interaction.fit_transform(X)

# Create DataFrame with feature names
interaction_df = pd.DataFrame(
    X_interaction,
    columns=interaction.get_feature_names_out(['Feature1', 'Feature2'])
)

print("Original data:")
print(df)
print("\nWith interaction terms:")
print(interaction_df)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import numpy as np
import matplotlib.pyplot as plt
# Generate data with outliers
X = np.random.randn(100, 1)
X[0] = 10  # Add an outlier

# Apply different scaling methods
scalers = [
    ('Standard', StandardScaler()),
    ('MinMax', MinMaxScaler()),
    ('Robust', RobustScaler())
]

plt.figure(figsize=(15, 10))

# Plot original data
plt.subplot(4, 1, 1)
plt.hist(X, bins=30)
plt.title('Original Data')

# Plot scaled data for each scaler
for i, (name, scaler) in enumerate(scalers):
    X_scaled = scaler.fit_transform(X)

    plt.subplot(4, 1, i+2)
    plt.hist(X_scaled, bins=30)
    plt.title(f'{name} Scaled Data')

plt.tight_layout()
plt.show()

# Compare specific values
sample = np.array([[0], [1], [10]])  # mean, 1 std dev, outlier
print("Original values:", sample.ravel())
for name, scaler in scalers:
    scaler.fit(X)
    scaled = scaler.transform(sample)
    print(f"{name} scaled:", scaled.ravel())
Study Tips:
fit(X, y): Train the model on data
predict(X): Make predictions for new data
score(X, y): Calculate model performance
transform(X): Apply a data transformation

Transformers expose fit and transform methods; predictors expose fit and predict methods. Combine fitting and transforming with fit_transform for efficiency.

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
# Generate sample data
X = np.random.rand(100, 2)
y = 3*X[:, 0] + 2*X[:, 1] + np.random.randn(100) * 0.1

# Example of transformer API
scaler = StandardScaler()
scaler.fit(X)                    # Learn parameters (mean, std)
X_scaled = scaler.transform(X)   # Apply transformation
# Equivalent shortcut: X_scaled = scaler.fit_transform(X)

# Example of predictor API
model = LinearRegression()
model.fit(X_scaled, y)              # Train model
y_pred = model.predict(X_scaled)    # Make predictions
r2 = model.score(X_scaled, y)       # Calculate R²

print(f"Model coefficients: {model.coef_}")
print(f"Model intercept: {model.intercept_}")
print(f"R² score: {r2}")
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
import numpy as np
from sklearn.model_selection import train_test_split
# Generate non-linear data
X = np.random.rand(100, 1)
y = np.sin(2 * np.pi * X.ravel()) + np.random.randn(100) * 0.1

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with preprocessing and model
pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=0.1))
])

# Train and evaluate in one step
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)

print(f"Pipeline R² score: {score}")

# Access individual steps
print(f"Polynomial features shape: {pipeline.named_steps['poly'].n_output_features_}")
print(f"Ridge coefficients: {pipeline.named_steps['ridge'].coef_}")
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
import numpy as np
# Generate non-linear data
X = np.random.rand(100, 1)
y = np.sin(2 * np.pi * X.ravel()) + np.random.randn(100) * 0.1

# Create pipeline
pipeline = Pipeline([
    ('poly', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

# Parameter grid
param_grid = {
    'poly__degree': [1, 2, 3, 4],
    'ridge__alpha': [0.001, 0.01, 0.1, 1.0, 10.0]
}

# Grid search
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='neg_mean_squared_error'
)
grid_search.fit(X, y)

# Print results
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best RMSE: {np.sqrt(-grid_search.best_score_)}")

# Make predictions with best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X)

# Plot results
import matplotlib.pyplot as plt
X_sorted = np.sort(X, axis=0)
y_pred_sorted = best_model.predict(X_sorted)

plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_sorted, y_pred_sorted, color='red', label='Best model fit')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.title('Best Pipeline Model')
plt.show()
Study Tips:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load data from CSV (example)
# df = pd.read_csv('your_data.csv')
# Create sample data
np.random.seed(42)
n_samples = 1000
n_features = 5
X = np.random.randn(n_samples, n_features)
y = 2*X[:, 0] + 3*X[:, 1] - X[:, 2] + 0.5*X[:, 3] + np.random.randn(n_samples) * 0.5

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
from sklearn.model_selection import cross_val_score, KFold, validation_curve
from sklearn.linear_model import Ridge
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
X = np.random.randn(100, 5)
y = 2*X[:, 0] + 3*X[:, 1] - X[:, 2] + np.random.randn(100) * 0.5

# Basic cross-validation
model = Ridge(alpha=1.0)
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-cv_scores)
print(f"Cross-validation RMSE: {rmse_scores.mean()} ± {rmse_scores.std()}")

# Custom KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_custom = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')
rmse_custom = np.sqrt(-cv_custom)
print(f"Custom KFold RMSE: {rmse_custom.mean()} ± {rmse_custom.std()}")

# Validation curve
param_range = np.logspace(-3, 3, 10)
train_scores, test_scores = validation_curve(
    Ridge(), X, y, param_name="alpha", param_range=param_range,
    cv=5, scoring="neg_mean_squared_error"
)

# Plot validation curve
plt.figure(figsize=(10, 6))
plt.semilogx(param_range, np.sqrt(-train_scores).mean(axis=1),
             label="Training RMSE")
plt.semilogx(param_range, np.sqrt(-test_scores).mean(axis=1),
             label="Validation RMSE")
plt.xlabel("alpha")
plt.ylabel("RMSE")
plt.legend()
plt.title("Validation Curve for Ridge Regression")
plt.grid()
plt.show()
from sklearn.model_selection import learning_curve
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
X = np.random.randn(200, 5)
y = 2*X[:, 0] + 3*X[:, 1] - X[:, 2] + np.random.randn(200) * 0.5

# Learning curve
train_sizes, train_scores, test_scores = learning_curve(
    LinearRegression(), X, y, cv=5, scoring='neg_mean_squared_error',
    train_sizes=np.linspace(0.1, 1.0, 10)
)

# Calculate RMSE
train_rmse = np.sqrt(-train_scores).mean(axis=1)
test_rmse = np.sqrt(-test_scores).mean(axis=1)

# Plot learning curve
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_rmse, 'o-', color='r', label='Training RMSE')
plt.plot(train_sizes, test_rmse, 'o-', color='g', label='Validation RMSE')
plt.xlabel('Training set size')
plt.ylabel('RMSE')
plt.title('Learning Curve for Linear Regression')
plt.legend()
plt.grid()
plt.show()

# Fit model for residual plot
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
residuals = y - y_pred

# Residual plot
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals)
plt.axhline(y=0, color='r', linestyle='-')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid()
plt.show()

# Feature importance plot
plt.figure(figsize=(10, 6))
plt.bar(range(X.shape[1]), model.coef_)
plt.xlabel('Feature index')
plt.ylabel('Coefficient value')
plt.title('Feature Importance')
plt.xticks(range(X.shape[1]))
plt.grid(axis='y')
plt.show()
Study Tips:
Q: What are the key differences between Ridge, Lasso, and Elastic Net regularization?
A:
Q: Explain the ε-insensitive loss function in SVR and how the key parameters affect the
model.
A:
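Hint: the ε-insensitive loss can be written as L_ε(y, ŷ) = max(0, |y − ŷ| − ε), so residuals smaller than ε incur no penalty; C weights violations outside the ε-tube (larger C means less regularization), and gamma (for the RBF kernel) controls how local the fit is.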
Q: What is k-fold cross-validation and why is it important?
A:
Q: Compare and contrast Decision Tree Regression and k-Nearest Neighbors Regression.
A:
Q: What is data leakage and how do scikit-learn Pipelines help prevent it?
A:
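Hint (a minimal sketch): scaling the full dataset before cross-validation leaks statistics from the held-out folds into preprocessing, while a Pipeline re-fits the scaler on each training fold only. The variable names below are illustrative.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
import numpy as np

X = np.random.randn(100, 5)
y = X[:, 0] + np.random.randn(100) * 0.5

# Leaky: the scaler sees all rows, including future validation folds
X_leaky = StandardScaler().fit_transform(X)
leaky_scores = cross_val_score(Ridge(), X_leaky, y, cv=5)

# Safe: the scaler is fit inside each training fold via the Pipeline
pipe = Pipeline([('scaler', StandardScaler()), ('ridge', Ridge())])
safe_scores = cross_val_score(pipe, X, y, cv=5)

print(f"CV R² with pre-scaled data: {leaky_scores.mean():.3f}")
print(f"CV R² with Pipeline scaling: {safe_scores.mean():.3f}")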
Q: How do you interpret learning curves to diagnose
model problems?
A:
Q: What are the main types of feature engineering
techniques and when should you use them?
A: