DS Lab Practical

DS Lab Manual


Practical No.1 DATA WRANGLING I


import pandas as pd

import numpy as np

import seaborn as sns

# Load the iris dataset and inspect its basic structure.
df = sns.load_dataset('iris')

print(df.head())

print(df.shape)

print(df.isnull().sum())

print(df.describe())

df.info()

print(df.dtypes)

# Min-max normalize every numeric column into the [0, 1] range.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

col_min = df[numeric_cols].min()
col_max = df[numeric_cols].max()
df[numeric_cols] = (df[numeric_cols] - col_min) / (col_max - col_min)

print(df.head())

# One-hot encode the categorical species column; drop_first removes one
# redundant dummy column per category.
df_encoded = pd.get_dummies(df, drop_first=True)

print(df_encoded.head())


Practical No.2 DATA WRANGLING II


import pandas as pd

import numpy as np

# Build a small student dataset with deliberate missing values (NaN) and
# outliers (Age 50, Study_Hours 20, Marks 200) to demonstrate cleaning.
df = pd.DataFrame({

'Name': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'],

'Age': [20, 21, np.nan, 22, 20, 23, 21, 50],

'Study_Hours': [2, 3, 4, np.nan, 5, 6, 2, 20],

'Attendance': [80, 85, 90, 88, np.nan, 92, 87, 95],

'Marks': [60, 65, 70, 75, 80, 85, 90, 200]

})

print(df)

print(df.isnull().sum())

# Mean-impute the missing numeric values. Assign the result back instead of
# calling fillna(..., inplace=True) on a column selection: that pattern is
# deprecated chained assignment (FutureWarning since pandas 2.1) and stops
# updating the frame under pandas 3.0 copy-on-write.
for col in ['Age', 'Study_Hours', 'Attendance']:
    df[col] = df[col].fillna(df[col].mean())

print(df)

# Cap Marks outliers using the 1.5*IQR rule (winsorize to the fences).
Q1 = df['Marks'].quantile(0.25)

Q3 = df['Marks'].quantile(0.75)

IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR

upper = Q3 + 1.5 * IQR

# clip() bounds the column to [lower, upper] in one pass (same effect as the
# pair of np.where replacements).
df['Marks'] = df['Marks'].clip(lower=lower, upper=upper)

print(df)

# Log transform to reduce the right skew of Marks.
df['Log_Marks'] = np.log(df['Marks'])

print(df)


Practical No.3 DESCRIPTIVE STATISTICS


import pandas as pd

import seaborn as sns

# Summary statistics of total_bill grouped by day of the week.
df1 = sns.load_dataset('tips')

print(df1.head())

grouped_stats = df1.groupby('day')['total_bill'].agg(['mean', 'median', 'min', 'max', 'std'])

print(grouped_stats)

# Collect the raw total_bill observations into one list per day.
group_list = df1.groupby('day')['total_bill'].apply(list)

print(group_list)

# Per-species descriptive statistics for the iris dataset.
df2 = sns.load_dataset('iris')

species_frames = [df2[df2['species'] == name]
                  for name in ('setosa', 'versicolor', 'virginica')]

# describe() for each species, in the same order as above.
for frame in species_frames:
    print(frame.describe())

# Quartiles (25th / 50th / 75th percentile) of the numeric columns per species.
for frame in species_frames:
    print(frame.select_dtypes(include=['float64', 'int64']).quantile([0.25, 0.5, 0.75]))


Practical No.4 DATA ANALYTICS I


import pandas as pd

import numpy as np

from sklearn.datasets import fetch_openml

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

# Fetch the Boston housing data from OpenML as a DataFrame (requires network).
boston = fetch_openml(name='boston', version=1, as_frame=True)

df = boston.frame

print(df.head())

print(df.shape)

# MEDV (median house value) is the regression target; all other columns are features.
X = df.drop('MEDV', axis=1)

y = df['MEDV']

# Some columns arrive as category/object dtype; coerce everything to numeric
# (non-convertible values become NaN) and mean-impute the gaps.
X = X.apply(pd.to_numeric, errors='coerce')

X = X.fillna(X.mean())

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("MSE:", mean_squared_error(y_test, y_pred))

print("R2 Score:", r2_score(y_test, y_pred))

print("Predicted Prices:", y_pred[:10])


Practical No.5 DATA ANALYTICS II


import pandas as pd

import numpy as np

from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix

# Synthesize a 2-feature binary classification problem as a stand-in for the
# classic "Age / EstimatedSalary -> Purchased" social-ads dataset.
X, y = make_classification(n_samples=400, n_features=2, n_redundant=0, n_informative=2, random_state=42)

df = pd.DataFrame(X, columns=['Age', 'EstimatedSalary'])

df['Purchased'] = y

X = df[['Age', 'EstimatedSalary']]

y = df['Purchased']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# For binary labels, confusion_matrix returns [[TN, FP], [FN, TP]].
cm = confusion_matrix(y_test, y_pred)

TN, FP, FN, TP = cm.ravel()

total = TP + TN + FP + FN

accuracy = (TP + TN) / total

error_rate = (FP + FN) / total

# Guard the denominators (no predicted / no actual positives would divide by
# zero); this matches the zero-safe metric style used in Practical No.6.
precision = TP / (TP + FP) if (TP + FP) != 0 else 0

recall = TP / (TP + FN) if (TP + FN) != 0 else 0

print("Confusion Matrix:\n", cm)

print("TP:", TP, "FP:", FP, "TN:", TN, "FN:", FN)

print("Accuracy:", accuracy)

print("Error Rate:", error_rate)

print("Precision:", precision)

print("Recall:", recall)


Practical No.6 DATA ANALYTICS III


import pandas as pd

import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import confusion_matrix

# Iris classification with Gaussian Naive Bayes.
df = sns.load_dataset('iris')

# Encode the three species labels as integers 0 / 1 / 2.
label_map = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
df['species'] = df['species'].map(label_map)

y = df['species']

X = df.drop('species', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = GaussianNB()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

# NOTE(review): these four counts read only the top-left 2x2 corner of the
# 3x3 multiclass matrix (classes 0 and 1), mimicking the binary-case formulas;
# they ignore class 2 entirely.
TP = cm[1, 1]

FP = cm[0, 1]

TN = cm[0, 0]

FN = cm[1, 0]

# Overall accuracy does use the full matrix: correct predictions lie on the diagonal.
accuracy = cm.diagonal().sum() / cm.sum()

error_rate = 1 - accuracy

# Zero-safe precision/recall (denominator can be 0 when a class is never predicted).
precision = TP / (TP + FP) if (TP + FP) != 0 else 0

recall = TP / (TP + FN) if (TP + FN) != 0 else 0

print("Confusion Matrix:\n", cm)

print("TP:", TP, "FP:", FP, "TN:", TN, "FN:", FN)

print("Accuracy:", accuracy)

print("Error Rate:", error_rate)

print("Precision:", precision)

print("Recall:", recall)


Practical No.7 TEXT ANALYTICS


import nltk

from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords

from nltk.stem import PorterStemmer, WordNetLemmatizer

from nltk import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK resources used below: tokenizer models, the stopword
# list, WordNet (for lemmatization), and the POS tagger.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

text = "Natural language processing is a field of artificial intelligence that focuses on interaction between computers and humans."

# Tokenization and part-of-speech tagging.
tokens = word_tokenize(text)

print("Tokens:", tokens)

pos = pos_tag(tokens)

print("POS Tags:", pos)

# Drop common English stopwords (comparison is case-insensitive).
stop_words = set(stopwords.words('english'))

filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

print("After Stopword Removal:", filtered_tokens)

# Stemming crudely chops suffixes; lemmatization maps to dictionary base forms.
ps = PorterStemmer()

stemmed = [ps.stem(token) for token in filtered_tokens]

print("Stemmed Words:", stemmed)

lemmatizer = WordNetLemmatizer()

lemmatized = [lemmatizer.lemmatize(token) for token in filtered_tokens]

print("Lemmatized Words:", lemmatized)

# TF-IDF representation of a tiny document collection.
documents = [

"Natural language processing enables computers to understand text",

"Machine learning improves natural language processing",

"Text analytics involves processing of text data"

]

vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(documents)

print("TF-IDF Feature Names:", vectorizer.get_feature_names_out())

print("TF-IDF Matrix:\n", tfidf.toarray())


Practical No.8 DATA VISUALIZATION I


import seaborn as sns

import matplotlib.pyplot as plt

# Explore survival patterns in the Titanic dataset with seaborn plots.
df = sns.load_dataset('titanic')

print(df.head())

print(df.shape)

print(df.info())

# Count plots: overall survival, then survival split by sex and by class
# (hue=None is seaborn's default, i.e. no split).
for x_col, hue_col in (('survived', None), ('sex', 'survived'), ('class', 'survived')):
    sns.countplot(x=x_col, hue=hue_col, data=df)
    plt.show()

# Fare distribution with a kernel-density overlay.
sns.histplot(df['fare'], bins=30, kde=True)

plt.show()


Practical No.9 DATA VISUALIZATION II


import seaborn as sns

import matplotlib.pyplot as plt

# Age distribution per sex, split by survival status, as grouped box plots.
titanic = sns.load_dataset('titanic')

sns.boxplot(x='sex', y='age', hue='survived', data=titanic)

plt.show()


Practical No.10 DATA VISUALIZATION III


import seaborn as sns

import matplotlib.pyplot as plt

# Histograms and box plots for every numeric feature of the iris dataset.
iris = sns.load_dataset('iris')

print(iris.dtypes)

# One histogram per numeric column, all on a shared figure.
iris.hist(figsize=(10, 8))

plt.show()

# A separate box plot per numeric column to surface outliers.
numeric_columns = iris.select_dtypes(include=['float64', 'int64']).columns

for column in numeric_columns:
    sns.boxplot(x=iris[column])
    plt.title(column)
    plt.show()


Practical No.11 DATABASE QUERYING (run it on MySQL or an online SQL compiler)


-- Create a database and a student table, insert sample rows, and query them.
CREATE DATABASE college_db;

USE college_db;

-- MySQL has no STRING data type (that is Hive/Spark SQL syntax);
-- use VARCHAR with an explicit maximum length instead.
CREATE TABLE student (

    id INT,

    name VARCHAR(50),

    department VARCHAR(50),

    marks INT

);

INSERT INTO student VALUES

(1, 'Shruti', 'AI&DS', 89),

(2, 'Supriya', 'CSE', 76),

(3, 'Hrucha', 'IT', 91),

(4, 'Yash', 'AIML', 85);

SELECT * FROM student;


Practical No.12 SCALA PROGRAMMING (run it on Apache Spark or Databricks)


// Scala Spark word count: builds a local Spark session, counts words across
// a small in-memory list of sentences, and prints each (word, count) pair.
import org.apache.spark.sql.SparkSession

object WordCountExample {

  def main(args: Array[String]): Unit = {

    // Local-mode session using all available cores.
    val spark = SparkSession.builder()
      .appName("Word Count Example")
      .master("local[*]")
      .getOrCreate()

    val sc = spark.sparkContext

    val lines = List("Hello Spark", "Hello Scala", "Apache Spark")

    // Split each line into words, pair every word with 1, then sum per word.
    val wordCounts = sc.parallelize(lines)
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)

    wordCounts.collect().foreach(println)

    spark.stop()
  }

}


Comments

Popular posts from this blog

SL-1 lab practical

CN Lab Manual

TE ANN Practical