GitLab wurde erfolgreich aktualisiert. Durch regelmäßige Updates bleibt das THM GitLab sicher. Danke für Ihre Geduld.

Commit 231d7279 authored by Jens Plüddemann's avatar Jens Plüddemann

added preprocessing lecture

parent f18fc5a0
......@@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="jdk" jdkName="Pipenv (predictive-analytics)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
......
......@@ -14,6 +14,7 @@ keras = "*"
tensorflow = "*"
statsmodels = "*"
xlrd = "*"
missingno = "*"
[requires]
python_version = "3.7"
{
"_meta": {
"hash": {
"sha256": "1f625db39df85372f27e3da776cee5c02bf67db700253d3b9553f5bc48679707"
"sha256": "d6c51b12d8644c79e22d068086ff2ef5e1196e314711aeedcaaa1148bae9cba3"
},
"pipfile-spec": 6,
"requires": {
......@@ -271,6 +271,14 @@
"index": "pypi",
"version": "==3.1.2"
},
"missingno": {
"hashes": [
"sha256:02eb92085e4efa0d4c06239750ac44a8d6cc5979bf5b954a09c47b648413ff41",
"sha256:a7d13b36cbb7b422b8b504d9661b23e0db99833ce8b60f9e0e5c8bdab2ccb857"
],
"index": "pypi",
"version": "==0.4.2"
},
"numpy": {
"hashes": [
"sha256:1786a08236f2c92ae0e70423c45e1e62788ed33028f94ca99c4df03f5be6b3c6",
......@@ -492,6 +500,13 @@
"markers": "python_version >= '3'",
"version": "==1.4.1"
},
"seaborn": {
"hashes": [
"sha256:59fe414e138d7d5ea08b0feb01b86caf4682e36fa748e3987730523a89aecbb9",
"sha256:bdf7714ef7d4603e6325d3902e80a46d6149561e1cc237ac08a1c05c3f55a996"
],
"version": "==0.10.0"
},
"six": {
"hashes": [
"sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a",
......@@ -571,18 +586,18 @@
},
"werkzeug": {
"hashes": [
"sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7",
"sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4"
"sha256:1e0dedc2acb1f46827daa2e399c1485c8fa17c0d8e70b6b875b4e7f54bf408d2",
"sha256:b353856d37dec59d6511359f97f6a4b2468442e454bd1c98298ddce53cac1f04"
],
"version": "==0.16.0"
"version": "==0.16.1"
},
"wheel": {
"hashes": [
"sha256:10c9da68765315ed98850f8e048347c3eb06dd81822dc2ab1d4fde9dc9702646",
"sha256:f4da1763d3becf2e2cd92a14a7c920f0f00eca30fdde9ea992c836685b9faf28"
"sha256:48e082fac9a549bb30abcb71360db41e9e999f63bfc9933fdb7339ba7205330f",
"sha256:664b9c5033ee7cd5aa6b355bc8a4a5915eadb7612e7b0acab1aa71f005457107"
],
"markers": "python_version >= '3'",
"version": "==0.33.6"
"version": "==0.34.1"
},
"wrapt": {
"hashes": [
......
YearsExperience,Salary
1.1,39343.00
1.3,46205.00
1.5,37731.00
2.0,43525.00
2.2,39891.00
2.9,56642.00
3.0,60150.00
3.2,54445.00
3.2,64445.00
3.7,57189.00
3.9,63218.00
4.0,55794.00
4.0,56957.00
4.1,57081.00
4.5,61111.00
4.9,67938.00
5.1,66029.00
5.3,83088.00
5.9,81363.00
6.0,93940.00
6.8,91738.00
7.1,98273.00
7.9,101302.00
8.2,113812.00
8.7,109431.00
9.0,105582.00
9.5,116969.00
9.6,112635.00
10.3,122391.00
10.5,121872.00
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
class OneHotEncoding:
    """Demo of label encoding vs. one-hot encoding on a small categorical sample."""

    def __init__(self):
        # Three animal categories with repeats, so encodings show shared codes.
        self.data = np.array(['Katze', 'Hund', 'Hamster', 'Hund', 'Katze', 'Hund'])

    def one_hot_encoder(self):
        """Print the label-encoded and the one-hot-encoded form of the data."""
        # Label encoding is shown for comparison only; OneHotEncoder accepts
        # string categories directly and does not need it as a preprocessing step.
        encoded_labels = LabelEncoder().fit_transform(self.data)
        print(encoded_labels)
        # reshape(-1, 1): the encoder expects a 2-D column of samples.
        encoder = OneHotEncoder(sparse=False)
        one_hot = encoder.fit_transform(self.data.reshape(-1, 1))
        print(one_hot)
if __name__ == '__main__':
    # Build the demo data set and run the encoding comparison.
    OneHotEncoding().one_hot_encoder()
import pandas as pd
import missingno as mn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
class PropertyData:
    """Demo of missing-value inspection, cleaning and imputation on a CSV data set."""

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.df = pd.read_csv(self.file_path)

    def look_at_the_data(self):
        """Print the head and the full frame for a first visual inspection."""
        print(self.df.head())
        print(self.df.to_string())

    def find_missing_values_matrix(self):
        """Show missing values as a missingno matrix plot."""
        mn.matrix(self.df)
        plt.show()

    def find_missing_values_bar(self):
        """Show per-column non-null counts as a missingno bar chart."""
        mn.bar(self.df)
        plt.show()

    def check_each_column(self):
        """Print every column separately to spot non-standard NA markers."""
        for column in self.df.columns:
            print('\n' + column)
            print(self.df[column].to_string())

    def fix_missing_data(self):
        """Re-read the file treating 'na'/'--' as NaN and normalize column types."""
        self.df = pd.read_csv(self.file_path, na_values=['na', '--'])
        # OWN_OCCUPIED: map 'Y'/'N' to booleans; any other leftover value
        # (bad data) becomes NaN so the imputers can handle it.
        self.df['OWN_OCCUPIED'] = self.df['OWN_OCCUPIED'] \
            .replace(['Y', 'N'], [True, False]) \
            .apply(lambda value: value if type(value) is bool else np.nan)
        # NUM_BATH: coerce non-numeric entries to NaN, then floor valid
        # values to whole bathroom counts.
        self.df['NUM_BATH'] = pd.to_numeric(self.df['NUM_BATH'], errors='coerce')
        self.df['NUM_BATH'] = self.df['NUM_BATH'] \
            .apply(lambda value: value if np.isnan(value) else int(np.floor(value)))

    def list_wise_deletion(self):
        """Drop every row that contains at least one missing value."""
        # BUG FIX: dropna() returns a new frame and the original code
        # discarded it, making this method a silent no-op.
        self.df = self.df.dropna()

    def mean_imputation(self):
        """Replace NaNs in the numeric columns with the column mean."""
        imputer = SimpleImputer()
        for column in ['ST_NUM', 'NUM_BEDROOMS', 'NUM_BATH', 'SQ_FT']:
            # SimpleImputer works on 2-D input, hence the reshape to a column.
            self.df[column] = imputer.fit_transform(self.df[column].to_numpy().reshape(-1, 1))

    def most_frequent_imputation(self):
        """Replace NaNs in OWN_OCCUPIED with the most frequent value."""
        self.df['OWN_OCCUPIED'] = SimpleImputer(strategy='most_frequent').fit_transform(
            self.df['OWN_OCCUPIED'].to_numpy().reshape(-1, 1))

    def final_result(self):
        """Run the full cleaning pipeline: type fixes, then both imputations."""
        self.fix_missing_data()
        self.mean_imputation()
        self.most_frequent_imputation()
if __name__ == '__main__':
    property_data = PropertyData('../../res/propertydata.csv')
    # Exploratory steps, kept for reference — uncomment to inspect the raw data:
    # property_data.look_at_the_data()
    # property_data.find_missing_values_matrix()
    # property_data.find_missing_values_bar()
    # property_data.check_each_column()
    property_data.final_result()
import numpy as np
from sklearn.preprocessing import scale, MinMaxScaler
class ScalingData:
    """Demo of standardization vs. min-max scaling on a tiny matrix."""

    def __init__(self):
        # Columns deliberately span very different ranges to make the
        # effect of each scaler visible.
        self.data = np.array([[1, -10, 0], [20, 0, 0.3], [16, 10, 0.9]])

    def scale_data(self):
        """Print the column-wise standardized data (zero mean, unit variance)."""
        print(scale(self.data))

    def min_max_scale_data(self):
        """Print the data scaled column-wise into the [0, 1] range."""
        scaler = MinMaxScaler()
        print(scaler.fit_transform(self.data))
if __name__ == '__main__':
    demo = ScalingData()
    demo.scale_data()
    demo.min_max_scale_data()
import numpy as np
from sklearn.model_selection import train_test_split
class SplittingData:
    """Demo of a reproducible train/test split with scikit-learn."""

    def __init__(self):
        # Five samples with two features each, plus matching labels 0..4.
        self.x = np.arange(10).reshape((5, 2))
        self.y = list(range(5))
        print(self.x)
        print(self.y)

    def splitting_data(self):
        """Split into roughly 2/3 train and 1/3 test with a fixed seed and print both."""
        x_train, x_test, y_train, y_test = train_test_split(
            self.x, self.y, test_size=0.33, random_state=42)
        print('\nSplitted data')
        print(x_train)
        print(x_test)
        print(y_train)
        print(y_test)
if __name__ == '__main__':
    # Construct the demo data (prints it) and run the split.
    SplittingData().splitting_data()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment