# 1a] Naive Bayes Classifier
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# Load data
df = pd.read_csv("spam1.csv", encoding="latin-1")[["Message", "Category"]]
df.columns = ["SMS", "Type"]
# Vectorize text data
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(df["SMS"])
y = df["Type"].values
# Train model
model = MultinomialNB()
model.fit(X, y)
# Predict on new messages
messages = ["Free gifts for all", "We will go for lunch"]
predictions = model.predict(vectorizer.transform(messages))
# Print predictions
for msg, pred in zip(messages, predictions):
print(f"Message: {msg} -> Prediction: {pred}")
# spam1.csv (example content, as original not fully provided)
"""
Message,Category
"Free gifts for all",Spam
"We will go for lunch",Ham
"Win a free trip now!",Spam
"Meeting at 3 PM",Ham
"""
# ----------------------------------------------------------------------------------------------
# 1b] SVM Classifier
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
# Load iris dataset
iris = datasets.load_iris()
X = iris.data[:, :2] # Use first two features for visualization
y = iris.target
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train SVM model
model = svm.SVC(kernel="linear", C=1)
model.fit(X_train, y_train)
# Predict and evaluate
predictions = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
# Plot decision boundary
h = 0.02  # step size of the mesh used to draw the decision boundary
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.viridis, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.viridis, edgecolors='k')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.title("SVM Decision Boundary")
plt.show()
# No CSV file (uses sklearn.datasets.load_iris())
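# A small follow-up sketch: compare kernels on the same split (the kernel list
# and C=1 are assumptions; tune them for real use, e.g. with GridSearchCV).
for kernel in ("linear", "rbf", "poly"):
    clf = svm.SVC(kernel=kernel, C=1).fit(X_train, y_train)
    print(f"{kernel} kernel accuracy: {accuracy_score(y_test, clf.predict(X_test)):.3f}")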
# ----------------------------------------------------------------------------------------------
# 3] Linear Regression
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Load data
data = pd.read_csv("salary_data.csv")
X = data[["YearsExperience"]] # Double brackets to keep it as DataFrame
y = data["Salary"]
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Train model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Plot
plt.scatter(X_test, y_test, color='red', label="Actual Data")
plt.plot(X_train, model.predict(X_train), color='blue', label="Regression Line")
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.title("Linear Regression: Salary vs Experience")
plt.legend()
plt.show()
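# The fitted line can also be inspected directly (a short sketch):
# coef_ holds the slope (salary increase per year) and intercept_ the base salary.
print("Slope:", model.coef_[0])
print("Intercept:", model.intercept_)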
# salary_data.csv (from document)
"""
YearsExperience,Salary
1.1,39343
1.3,46205
1.5,37731
2,43525
2.2,39891
2.9,56642
3,60150
3.2,54445
3.2,64445
3.7,57189
3.9,63218
4,55794
4.1,56957
4.5,67081
,61111
"""
# ----------------------------------------------------------------------------------------------
# 4] Logistic Regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Load data
df = pd.read_csv("admissions.csv")
X = df[["gre", "gpa", "rank"]]
y = df["admit"]
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# admissions.csv (constructed from document)
"""
admit,gre,gpa,rank
0,380,3.61,3
1,660,3.67,3
1,800,4.0,1
1,640,3.19,4
0,520,2.93,4
1,760,3.0,2
1,560,2.98,1
0,400,3.08,2
1,540,3.39,3
0,700,3.92,2
"""
# ----------------------------------------------------------------------------------------------
# 5] Time Series
import pandas as pd
import matplotlib.pyplot as plt
# Load data
df = pd.read_csv("temperature.csv")
df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%Y")
# Plot
plt.plot(df["Date"], df["Temp"], marker='o')
plt.xlabel("Date")
plt.ylabel("Temperature")
plt.title("Temperature Time Series")
plt.xticks(rotation=45)
plt.show()
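# A common next step is smoothing (a sketch; the 3-day window is an assumption):
# a rolling mean highlights the trend behind day-to-day noise.
df["RollingMean"] = df["Temp"].rolling(window=3).mean()
plt.plot(df["Date"], df["Temp"], marker='o', label="Daily Temp")
plt.plot(df["Date"], df["RollingMean"], color="red", label="3-Day Rolling Mean")
plt.xticks(rotation=45)
plt.legend()
plt.show()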
# temperature.csv (constructed from Page 6)
"""
Date,Temp
01-01-1981,20.7
02-01-1981,17.9
03-01-1981,18.8
04-01-1981,14.6
05-01-1981,15.8
06-01-1981,15.8
07-01-1981,15.8
"""
# ----------------------------------------------------------------------------------------------
# 2a] K-Means Clustering
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Load data
data = pd.read_csv("Mall_Customer.csv")
X = data.iloc[:, [3, 4]].values # Annual Income and Spending Score
# Train K-Means model
model = KMeans(n_clusters=5, n_init=10, random_state=42)  # n_init set explicitly; its default changed across sklearn versions
labels = model.fit_predict(X)
# Plot
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap="viridis", s=50, label="Data Points")
plt.scatter(model.cluster_centers_[:, 0], model.cluster_centers_[:, 1], c="red", marker="x", label="Centroids")
plt.xlabel("Annual Income")
plt.ylabel("Spending Score (1-100)")
plt.title("K-Means Clustering")
plt.legend()
plt.show()
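# Choosing k with the elbow method (a sketch; the range 1-10 is an assumption):
# plot inertia (within-cluster sum of squares) against k and look for the bend.
inertias = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(X).inertia_ for k in range(1, 11)]
plt.plot(range(1, 11), inertias, marker="o")
plt.xlabel("Number of clusters k")
plt.ylabel("Inertia")
plt.title("Elbow Method")
plt.show()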
# Mall_Customer.csv (partial content from document)
"""
Customer,Genre,Age,Annual Income,Spending Score (1-100)
1,Male,19,15,39
2,Male,21,15,81
3,Female,20,16,6
4,Female,23,16,77
5,Female,31,17,40
6,Female,22,17,76
7,Female,35,18,6
8,Female,23,18,94
9,Male,64,19,3
"""
# ----------------------------------------------------------------------------------------------
# 2b] Hierarchical Clustering
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
# Load data
df = pd.read_csv("wholesale_customers.csv")
X = df[["Fresh", "Milk", "Grocery", "Frozen"]].values
# Perform hierarchical clustering
Z = linkage(X, method="ward")
# Plot dendrogram
plt.figure(figsize=(10, 7))
dendrogram(Z)
plt.xlabel("Customers")
plt.ylabel("Euclidean Distance")
plt.title("Hierarchical Clustering Dendrogram")
plt.show()
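# To turn the dendrogram into concrete labels (a sketch; cutting the tree into
# 3 clusters is an assumption), scipy's fcluster cuts the linkage at a given level.
from scipy.cluster.hierarchy import fcluster
cluster_labels = fcluster(Z, t=3, criterion="maxclust")
print("Cluster labels:", cluster_labels)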
# wholesale_customers.csv (constructed from document)
"""
Channel,Region,Fresh,Milk,Grocery,Frozen
2,3,12469,9656,7561,214
2,3,7657,9818,9568,1742
2,3,6353,8886,7684,2405
2,3,13265,1196,4221,6404
2,3,22615,5418,7198,319
2,3,12126,3199,6975,480
2,3,7479,4694,427,1460
"""
# ----------------------------------------------------------------------------------------------
# 3] Linear Regression (updated: separate plots for test and training data)
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Load data
data = pd.read_csv("salary_data.csv")
X = data[["YearsExperience"]] # Double brackets to keep it as DataFrame
y = data["Salary"]
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Train model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)
# Plot Test Data
plt.scatter(X_test, y_test, color='red', label="Actual Data")
plt.plot(X_train, y_pred_train, color='blue', label="Regression Line")
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.title("Linear Regression: Salary vs Experience (Test Dataset)")
plt.legend()
plt.show()
# Plot Training Data
plt.scatter(X_train, y_train, color="green", label="Training Data")
plt.plot(X_train, y_pred_train, color="red", label="Regression Line")
plt.title("Salary vs Experience (Training Dataset)")
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.legend()
plt.show()
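# Evaluating the fit numerically (a short sketch): mean squared error and R^2
# on the held-out test split.
from sklearn.metrics import mean_squared_error, r2_score
print("MSE:", mean_squared_error(y_test, y_pred_test))
print("R^2:", r2_score(y_test, y_pred_test))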