# Study how fish species relate to their physical measurements.
# Load the data and convert the Species column to numeric codes.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import seaborn as sns
# Read the fish measurements and replace each species name with an integer code.
fishing = pd.read_csv('Fish.csv')
_species_codes = {
    'Bream': 1,
    'Roach': 2,
    'Whitefish': 3,
    'Parkki': 4,
    'Perch': 5,
    'Pike': 6,
    'Smelt': 7,
}
fishing['Species'] = fishing['Species'].map(_species_codes)
# Preview the first ten rows.
fishing.head(10)
Species | Weight | Length1 | Length2 | Length3 | Height | Width | |
---|---|---|---|---|---|---|---|
0 | 1 | 242.0 | 23.2 | 25.4 | 30.0 | 11.5200 | 4.0200 |
1 | 1 | 290.0 | 24.0 | 26.3 | 31.2 | 12.4800 | 4.3056 |
2 | 1 | 340.0 | 23.9 | 26.5 | 31.1 | 12.3778 | 4.6961 |
3 | 1 | 363.0 | 26.3 | 29.0 | 33.5 | 12.7300 | 4.4555 |
4 | 1 | 430.0 | 26.5 | 29.0 | 34.0 | 12.4440 | 5.1340 |
5 | 1 | 450.0 | 26.8 | 29.7 | 34.7 | 13.6024 | 4.9274 |
6 | 1 | 500.0 | 26.8 | 29.7 | 34.5 | 14.1795 | 5.2785 |
7 | 1 | 390.0 | 27.6 | 30.0 | 35.0 | 12.6700 | 4.6900 |
8 | 1 | 450.0 | 27.6 | 30.0 | 35.1 | 14.0049 | 4.8438 |
9 | 1 | 500.0 | 28.5 | 30.7 | 36.2 | 14.2266 | 4.9594 |
# Draw a 50-bin histogram for the columns of the frame on a 20x15 figure.
fishing.hist(figsize=(20, 15), bins=50)
plt.show()
# Show summary statistics (count, mean, std, min, quartiles, max) for each column.
import pandas as pd
fishing.describe()
Species | Weight | Length1 | Length2 | Length3 | Height | Width | |
---|---|---|---|---|---|---|---|
count | 159.000000 | 159.000000 | 159.000000 | 159.000000 | 159.000000 | 159.000000 | 159.000000 |
mean | 3.880503 | 398.326415 | 26.247170 | 28.415723 | 31.227044 | 8.970994 | 4.417486 |
std | 2.026298 | 357.978317 | 9.996441 | 10.716328 | 11.610246 | 4.286208 | 1.685804 |
min | 1.000000 | 0.000000 | 7.500000 | 8.400000 | 8.800000 | 1.728400 | 1.047600 |
25% | 2.000000 | 120.000000 | 19.050000 | 21.000000 | 23.150000 | 5.944800 | 3.385650 |
50% | 5.000000 | 273.000000 | 25.200000 | 27.300000 | 29.400000 | 7.786000 | 4.248500 |
75% | 5.000000 | 650.000000 | 32.700000 | 35.500000 | 39.650000 | 12.365900 | 5.584500 |
max | 7.000000 | 1650.000000 | 59.000000 | 63.400000 | 68.000000 | 18.957000 | 8.142000 |
# Find relationships between the variables
from pandas.plotting import scatter_matrix
import pandas as pd
import matplotlib.pyplot as plt
# Re-read the raw data (this discards the numeric Species codes assigned
# earlier) and draw pairwise scatter plots for the listed columns.
fishing = pd.read_csv('Fish.csv')
reg = [
    "Species",
    "Weight",
    "Length1",
    "Length2",
    "Length3",
    "Height",
    "Width",
]
scatter_matrix(fishing.loc[:, reg], figsize=(15, 15))
plt.show()
import pandas as pd
# Correlation of every numeric measurement with Weight.
fishing = pd.read_csv('Fish.csv')
# The freshly read Species column holds species names (strings). Older pandas
# dropped such columns from corr() silently; pandas >= 2.0 raises instead.
# Selecting the numeric columns explicitly works on both.
numeric_columns = fishing.select_dtypes(include="number")
corr_matrix = numeric_columns.corr()
corr_matrix["Weight"]
Weight 1.000000 Length1 0.915712 Length2 0.918618 Length3 0.923044 Height 0.724345 Width 0.886507 Name: Weight, dtype: float64
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
# Annotated heatmap of the pairwise correlations between numeric measurements.
fishing = pd.read_csv('Fish.csv')
plt.figure(figsize=(10, 6))
# Species is a string column in the raw file; restrict to numeric columns so
# corr() does not fail on pandas versions that no longer drop it silently.
sns.heatmap(fishing.select_dtypes(include="number").corr(), annot=True)
<AxesSubplot:>
# Pairwise plots of every measurement except Weight, coloured by species.
import seaborn as sns
# `size` was renamed to `height` in seaborn; the old keyword triggers a
# deprecation warning.
sns.pairplot(fishing.drop("Weight", axis=1), hue="Species", height=3)
D:\ProgramData\Anaconda3\lib\site-packages\seaborn\axisgrid.py:1912: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
<seaborn.axisgrid.PairGrid at 0x260a429e4f0>
# Apply classification and clustering algorithms
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
# Fit a k-nearest-neighbours classifier that predicts Species from the measurements.
# X and y must be built before the split; calling train_test_split first
# raises NameError in a fresh session (or silently reuses stale variables).
fishing = pd.read_csv('Fish.csv')
X = fishing.drop("Species", axis=1)
y = fishing['Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)
knn = KNeighborsClassifier(n_neighbors=2, metric='minkowski', p=5)
knn.fit(X_train, y_train)
# Accuracy measured on the training split (not on the held-out X_test/y_test).
knn.score(X_train, y_train)
0.7747747747747747
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage,dendrogram,fcluster
# Complete-linkage hierarchical clustering of the measurements, shown as a dendrogram.
fishing = pd.read_csv('Fish.csv')
fishing.head()
y = fishing['Species']
X = fishing.drop(columns="Species")
# The split is created here but the clustering below runs on all of X.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=8
)
hirarachical = linkage(X, method='complete')
dendrogram(hirarachical)
plt.show()
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
# Compare several classifiers on the same 70/30 split of the fish data.
fishing = pd.read_csv('Fish.csv')
X = fishing.drop("Species", axis=1)
y = fishing['Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)
models = {
    # With the default iteration cap lbfgs stops before converging on these
    # unscaled features (scikit-learn emits a ConvergenceWarning), so raise it.
    "LogisticRegression ": LogisticRegression(max_iter=10000),
    "RandomForestClassifier ": RandomForestClassifier(),
    "GradientBoostingClassifier ": GradientBoostingClassifier(),
    "DecisionTreeClassifier ": DecisionTreeClassifier(),
}
for name, model in models.items():
    model.fit(X_train, y_train)
for name, model in models.items():
    # Accuracy on the data the model was fitted on.
    print(name + ":{:.2f}%".format(model.score(X_train, y_train) * 100))
    # Accuracy on the held-out split, which the models never saw during fit.
    print(name + "(test):{:.2f}%".format(model.score(X_test, y_test) * 100))
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result(
LogisticRegression :76.58% RandomForestClassifier :100.00% GradientBoostingClassifier :100.00% DecisionTreeClassifier :100.00%