# 1a] Naive Bayes Classifier

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB


# Load data

# Read the SMS dataset, keep only the message text and its label, and
# rename the columns for convenience.
sms_data = pd.read_csv("spam1.csv", encoding="latin-1")[["Message", "Category"]]
sms_data.columns = ["SMS", "Type"]

# Turn each message into a bag-of-words count vector, dropping common
# English stop words.
bow = CountVectorizer(stop_words="english")
features = bow.fit_transform(sms_data["SMS"])
labels = sms_data["Type"].values

# Fit a multinomial Naive Bayes classifier on the full dataset.
classifier = MultinomialNB()
classifier.fit(features, labels)

# Classify a couple of unseen messages (vectorized with the same
# fitted vocabulary).
messages = ["Free gifts for all", "We will go for lunch"]
predictions = classifier.predict(bow.transform(messages))

# Report the predicted label for each message.
for msg, pred in zip(messages, predictions):
    print(f"Message: {msg} -> Prediction: {pred}")


# spam1.csv (example content, as original not fully provided)

"""

Message,Category

"Free gifts for all",Spam

"We will go for lunch",Ham

"Win a free trip now!",Spam

"Meeting at 3 PM",Ham

"""


# ----------------------------------------------------------------------------------------------


# 1b] SVM Classifier

import pandas as pd

from sklearn import svm, datasets

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

import numpy as np

import matplotlib.pyplot as plt


# Load iris dataset

# Load the iris dataset, keeping only the first two features so the
# decision regions can be drawn in 2-D.
iris = datasets.load_iris()
features = iris.data[:, :2]
targets = iris.target

# Hold out 30% of the samples for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.3, random_state=42
)

# Linear-kernel support vector classifier.
clf = svm.SVC(kernel="linear", C=1)
clf.fit(X_train, y_train)

# Accuracy on the held-out split.
print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))

# Evaluate the classifier over a dense grid covering the data (with a
# one-unit margin) to draw the decision regions.
step = 0.02
x_lo, x_hi = features[:, 0].min() - 1, features[:, 0].max() + 1
y_lo, y_hi = features[:, 1].min() - 1, features[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_lo, x_hi, step), np.arange(y_lo, y_hi, step))
grid_labels = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

# Filled decision regions with the training points scattered on top.
plt.contourf(xx, yy, grid_labels, cmap=plt.cm.viridis, alpha=0.8)
plt.scatter(features[:, 0], features[:, 1], c=targets, cmap=plt.cm.viridis, edgecolors='k')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.title("SVM Decision Boundary")
plt.show()


# No CSV file (uses sklearn.datasets.load_iris())


# ----------------------------------------------------------------------------------------------


# 3] Linear Regression

import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression


# Load data

# Load the experience/salary data.  The sample CSV documented below
# contains a row with a missing YearsExperience value (",61111");
# LinearRegression.fit rejects NaNs with a ValueError, so incomplete
# rows are dropped up front.
data = pd.read_csv("salary_data.csv").dropna(subset=["YearsExperience", "Salary"])

X = data[["YearsExperience"]]  # double brackets keep a 2-D DataFrame, as fit() expects
y = data["Salary"]

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit ordinary least squares on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on the held-out data.
y_pred = model.predict(X_test)

# Plot the held-out points against the fitted line (the line is
# evaluated over the training inputs so it spans the training range).
plt.scatter(X_test, y_test, color='red', label="Actual Data")
plt.plot(X_train, model.predict(X_train), color='blue', label="Regression Line")
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.title("Linear Regression: Salary vs Experience")
plt.legend()
plt.show()


# salary_data.csv (from document)

"""

YearsExperience,Salary

1.1,39343

1.3,46205

1.5,37731

2,43525

2.2,39891

2.9,56642

3,60150

3.2,54445

3.2,64445

3.7,57189

3.9,63218

4,55794

4.1,56957

4.5,67081

,61111

"""


# ----------------------------------------------------------------------------------------------


# 4] Logistic Regression

import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score


# Load data

# Read the admissions data; predictors are GRE score, GPA, and the
# school's rank, the target is the binary admit flag.
admissions = pd.read_csv("admissions.csv")
features = admissions[["gre", "gpa", "rank"]]
target = admissions["admit"]

# Reserve 30% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42
)

# Logistic regression; max_iter raised so the solver can converge.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Report hold-out accuracy.
print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))


# admissions.csv (constructed from document)

"""

admit,gre,gpa,rank

0,380,3.61,3

1,660,3.67,3

1,800,4.0,1

1,640,3.19,4

0,520,2.93,4

1,760,3.0,2

1,560,2.98,1

0,400,3.08,2

1,540,3.39,3

0,700,3.92,2

"""


# ----------------------------------------------------------------------------------------------


# 5] Time Series

import pandas as pd

import matplotlib.pyplot as plt


# Load data

# Read the daily temperature series and parse the day-month-year dates
# into real datetimes so matplotlib spaces them correctly.
series = pd.read_csv("temperature.csv")
series["Date"] = pd.to_datetime(series["Date"], format="%d-%m-%Y")

# Line plot with a marker at each observation; tilt the date labels so
# they stay readable.
plt.plot(series["Date"], series["Temp"], marker='o')
plt.xlabel("Date")
plt.ylabel("Temperature")
plt.title("Temperature Time Series")
plt.xticks(rotation=45)
plt.show()


# temperature.csv (constructed from Page 6)

"""

Date,Temp

01-01-1981,20.7

02-01-1981,17.9

03-01-1981,18.8

04-01-1981,14.6

05-01-1981,15.8

06-01-1981,15.8

07-01-1981,15.8

"""


# ----------------------------------------------------------------------------------------------


# 2a] K-Means Clustering

import pandas as pd

from sklearn.cluster import KMeans

import matplotlib.pyplot as plt


# Load data

# Load the customer data; positional columns 3 and 4 hold annual
# income and spending score, the two features we cluster on.
customers = pd.read_csv("Mall_Customer.csv")
points = customers.iloc[:, [3, 4]].values

# Partition the customers into five clusters (fixed seed so the
# assignment is reproducible).
km = KMeans(n_clusters=5, random_state=42)
cluster_ids = km.fit_predict(points)

# Scatter the points coloured by cluster, with the centroids marked.
plt.scatter(points[:, 0], points[:, 1], c=cluster_ids, cmap="viridis", s=50, label="Data Points")
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], c="red", marker="x", label="Centroids")
plt.xlabel("Annual Income")
plt.ylabel("Spending Score (1-100)")
plt.title("K-Means Clustering")
plt.legend()
plt.show()


# Mall_Customer.csv (partial content from document)

"""

Customer,Genre,Age,Annual Income,Spending Score (1-100)

1,Male,19,15,39

2,Male,21,15,81

3,Female,20,16,6

4,Female,23,16,77

5,Female,31,17,40

6,Female,22,17,76

7,Female,35,18,6

8,Female,23,18,94

9,Male,64,19,3

"""


# ----------------------------------------------------------------------------------------------


# 2b] Hierarchical Clustering

import pandas as pd

import matplotlib.pyplot as plt

from scipy.cluster.hierarchy import dendrogram, linkage


# Load data

# Read the wholesale spending data and keep four product categories as
# the clustering features.
spend = pd.read_csv("wholesale_customers.csv")
samples = spend[["Fresh", "Milk", "Grocery", "Frozen"]].values

# Agglomerative clustering with Ward linkage (each merge minimises the
# increase in within-cluster variance).
merge_tree = linkage(samples, method="ward")

# Draw the merge tree as a dendrogram.
plt.figure(figsize=(10, 7))
dendrogram(merge_tree)
plt.xlabel("Customers")
plt.ylabel("Euclidean Distance")
plt.title("Hierarchical Clustering Dendrogram")
plt.show()


# wholesale_customers.csv (constructed from document)

"""

Channel,Region,Fresh,Milk,Grocery,Frozen

2,3,12469,9656,7561,214

2,3,7657,9818,9568,1742

2,3,6353,8886,7684,2405

2,3,13265,1196,4221,6404

2,3,22615,5418,7198,319

2,3,12126,3199,6975,480

2,3,7479,4694,427,1460

"""


# ----------------------------------------------------------------------------------------------





# 3] Updated Linear Regression (revision of section 3 adding a training-set plot)


import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression


# Load data

# Load the experience/salary data.  The salary_data.csv sample shown in
# section 3 includes a row whose YearsExperience is missing;
# LinearRegression.fit rejects NaNs with a ValueError, so incomplete
# rows are dropped up front.
data = pd.read_csv("salary_data.csv").dropna(subset=["YearsExperience", "Salary"])

X = data[["YearsExperience"]]  # double brackets keep a 2-D DataFrame, as fit() expects
y = data["Salary"]

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit ordinary least squares on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)

# Fitted values over the training inputs; reused as the regression line
# in both plots below.  (The original also computed test-set
# predictions, but they were never used, so that dead assignment is
# gone.)
y_pred_train = model.predict(X_train)

# Plot Test Data: held-out points against the fitted line.
plt.scatter(X_test, y_test, color='red', label="Actual Data")
plt.plot(X_train, y_pred_train, color='blue', label="Regression Line")
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.title("Linear Regression: Salary vs Experience (Test Dataset)")
plt.legend()
plt.show()

# Plot Training Data: the points the model was fitted on, with the
# same fitted line.
plt.scatter(X_train, y_train, color="green", label="Training Data")
plt.plot(X_train, y_pred_train, color="red", label="Regression Line")
plt.title("Salary vs Experience (Training Dataset)")
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.legend()
plt.show()


# End of file.