DS Lab Practical
Practical No.1 DATA WRANGLING I
import pandas as pd
import numpy as np
import seaborn as sns

# Load the iris dataset and inspect its basic structure.
df = sns.load_dataset('iris')
print(df.head())
print(df.shape)
print(df.isnull().sum())
print(df.describe())
df.info()
print(df.dtypes)

# Min-max normalize every numeric column into the [0, 1] range.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
col_min = df[numeric_cols].min()
col_max = df[numeric_cols].max()
df[numeric_cols] = (df[numeric_cols] - col_min) / (col_max - col_min)
print(df.head())

# One-hot encode the categorical 'species' column, dropping the first level
# to avoid redundant (perfectly collinear) dummy columns.
df_encoded = pd.get_dummies(df, drop_first=True)
print(df_encoded.head())
Practical No.2 DATA WRANGLING II
import pandas as pd
import numpy as np

# Build a small student dataset containing missing values and outliers.
df = pd.DataFrame({
'Name': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'],
'Age': [20, 21, np.nan, 22, 20, 23, 21, 50],
'Study_Hours': [2, 3, 4, np.nan, 5, 6, 2, 20],
'Attendance': [80, 85, 90, 88, np.nan, 92, 87, 95],
'Marks': [60, 65, 70, 75, 80, 85, 90, 200]
})
print(df)
print(df.isnull().sum())

# Impute missing numeric values with the column mean.
# NOTE: the previous df['col'].fillna(..., inplace=True) form is chained
# assignment — deprecated since pandas 2.1 (FutureWarning) and a silent
# no-op under copy-on-write (pandas 3.0). Assign the result back instead.
for col in ['Age', 'Study_Hours', 'Attendance']:
    df[col] = df[col].fillna(df[col].mean())
print(df)

# Cap outliers in 'Marks' using the 1.5 * IQR rule (winsorization).
Q1 = df['Marks'].quantile(0.25)
Q3 = df['Marks'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
df['Marks'] = df['Marks'].clip(lower=lower, upper=upper)
print(df)

# Log transform to reduce right skew in the marks distribution.
df['Log_Marks'] = np.log(df['Marks'])
print(df)
Practical No.3 DESCRIPTIVE STATISTICS
import pandas as pd
import seaborn as sns
df1 = sns.load_dataset('tips')
print(df1.head())
grouped_stats = df1.groupby('day')['total_bill'].agg(['mean', 'median', 'min', 'max', 'std'])
print(grouped_stats)
group_list = df1.groupby('day')['total_bill'].apply(list)
print(group_list)
df2 = sns.load_dataset('iris')
setosa = df2[df2['species'] == 'setosa']
versicolor = df2[df2['species'] == 'versicolor']
virginica = df2[df2['species'] == 'virginica']
print(setosa.describe())
print(versicolor.describe())
print(virginica.describe())
print(setosa.select_dtypes(include=['float64','int64']).quantile([0.25, 0.5, 0.75]))
print(versicolor.select_dtypes(include=['float64','int64']).quantile([0.25, 0.5, 0.75]))
print(virginica.select_dtypes(include=['float64','int64']).quantile([0.25, 0.5, 0.75]))
Practical No.4 DATA ANALYTICS I
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
boston = fetch_openml(name='boston', version=1, as_frame=True)
df = boston.frame
print(df.head())
print(df.shape)
X = df.drop('MEDV', axis=1)
y = df['MEDV']
X = X.apply(pd.to_numeric, errors='coerce')
X.fillna(X.mean(), inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))
print("Predicted Prices:", y_pred[:10])
Practical No.5 DATA ANALYTICS II
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
X, y = make_classification(n_samples=400, n_features=2, n_redundant=0, n_informative=2, random_state=42)
df = pd.DataFrame(X, columns=['Age', 'EstimatedSalary'])
df['Purchased'] = y
X = df[['Age', 'EstimatedSalary']]
y = df['Purchased']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm.ravel()
accuracy = (TP + TN) / (TP + TN + FP + FN)
error_rate = (FP + FN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print("Confusion Matrix:\n", cm)
print("TP:", TP, "FP:", FP, "TN:", TN, "FN:", FN)
print("Accuracy:", accuracy)
print("Error Rate:", error_rate)
print("Precision:", precision)
print("Recall:", recall)
Practical No.6 DATA ANALYTICS III
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
df = sns.load_dataset('iris')
df['species'] = df['species'].map({'setosa':0, 'versicolor':1, 'virginica':2})
X = df.drop('species', axis=1)
y = df['species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
TP = cm[1,1]
FP = cm[0,1]
TN = cm[0,0]
FN = cm[1,0]
accuracy = (cm.diagonal().sum()) / cm.sum()
error_rate = 1 - accuracy
precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
print("Confusion Matrix:\n", cm)
print("TP:", TP, "FP:", FP, "TN:", TN, "FN:", FN)
print("Accuracy:", accuracy)
print("Error Rate:", error_rate)
print("Precision:", precision)
print("Recall:", recall)
Practical No.7 TEXT ANALYTICS
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
text = "Natural language processing is a field of artificial intelligence that focuses on interaction between computers and humans."
tokens = word_tokenize(text)
print("Tokens:", tokens)
pos = pos_tag(tokens)
print("POS Tags:", pos)
stop_words = set(stopwords.words('english'))
filtered_tokens = [w for w in tokens if w.lower() not in stop_words]
print("After Stopword Removal:", filtered_tokens)
ps = PorterStemmer()
stemmed = [ps.stem(w) for w in filtered_tokens]
print("Stemmed Words:", stemmed)
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in filtered_tokens]
print("Lemmatized Words:", lemmatized)
documents = [
"Natural language processing enables computers to understand text",
"Machine learning improves natural language processing",
"Text analytics involves processing of text data"
]
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(documents)
print("TF-IDF Feature Names:", vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", tfidf.toarray())
Practical No.8 DATA VISUALIZATION I
import seaborn as sns
import matplotlib.pyplot as plt
df = sns.load_dataset('titanic')
print(df.head())
print(df.shape)
print(df.info())
sns.countplot(x='survived', data=df)
plt.show()
sns.countplot(x='sex', hue='survived', data=df)
plt.show()
sns.countplot(x='class', hue='survived', data=df)
plt.show()
sns.histplot(df['fare'], bins=30, kde=True)
plt.show()
Practical No.9 DATA VISUALIZATION II
import seaborn as sns
import matplotlib.pyplot as plt
df = sns.load_dataset('titanic')
sns.boxplot(x='sex', y='age', hue='survived', data=df)
plt.show()
Practical No.10 DATA VISUALIZATION III
import seaborn as sns
import matplotlib.pyplot as plt
df = sns.load_dataset('iris')
print(df.dtypes)
df.hist(figsize=(10,8))
plt.show()
for col in df.select_dtypes(include=['float64','int64']).columns:
sns.boxplot(x=df[col])
plt.title(col)
plt.show()
Practical No.11 DATABASE QUERYING (run on MySQL or an online SQL compiler)
-- Create and populate a simple student table.
CREATE DATABASE college_db;
USE college_db;

-- BUG FIX: MySQL has no STRING data type; use VARCHAR with an explicit
-- length. Also make id the primary key so duplicate ids are rejected.
CREATE TABLE student (
id INT PRIMARY KEY,
name VARCHAR(50),
department VARCHAR(50),
marks INT
);

INSERT INTO student VALUES
(1, 'Shruti', 'AI&DS', 89),
(2, 'Supriya', 'CSE', 76),
(3, 'Hrucha', 'IT', 91),
(4, 'Yash', 'AIML', 85);

SELECT * FROM student;
Practical No.12 SCALA PROGRAMMING (run on Apache Spark or Databricks)
// Scala Spark Word Count Example
import org.apache.spark.sql.SparkSession
object WordCountExample {
def main(args: Array[String]): Unit = {
// Build (or reuse) a local SparkSession for this example.
val spark = SparkSession.builder()
.appName("Word Count Example")
.master("local[*]")
.getOrCreate()

val lines = List("Hello Spark", "Hello Scala", "Apache Spark")
val linesRdd = spark.sparkContext.parallelize(lines)

// Split each line into words, pair every word with 1, then sum per word.
val counts = linesRdd
.flatMap(_.split(" "))
.map((_, 1))
.reduceByKey(_ + _)

counts.collect().foreach(println)
spark.stop()
}
}
Comments
Post a Comment