In [1]:
#research in types of fishes by physical factors
#show the data and change the a colome to the numerical

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import seaborn as sns
fishing = pd.read_csv('Fish.csv')
fishing['Species'] = fishing['Species'].map({'Bream': 1, 'Roach': 2, 'Whitefish': 3 ,'Parkki': 4, 'Perch': 5, 'Pike': 6, 'Smelt': 7})
fishing.head(10)
    
Out[1]:
Species Weight Length1 Length2 Length3 Height Width
0 1 242.0 23.2 25.4 30.0 11.5200 4.0200
1 1 290.0 24.0 26.3 31.2 12.4800 4.3056
2 1 340.0 23.9 26.5 31.1 12.3778 4.6961
3 1 363.0 26.3 29.0 33.5 12.7300 4.4555
4 1 430.0 26.5 29.0 34.0 12.4440 5.1340
5 1 450.0 26.8 29.7 34.7 13.6024 4.9274
6 1 500.0 26.8 29.7 34.5 14.1795 5.2785
7 1 390.0 27.6 30.0 35.0 12.6700 4.6900
8 1 450.0 27.6 30.0 35.1 14.0049 4.8438
9 1 500.0 28.5 30.7 36.2 14.2266 4.9594
In [2]:
fishing.hist(bins=50, figsize=(20,15))
plt.show()
In [3]:
#show static variables
import pandas as pd  
fishing.describe()
Out[3]:
Species Weight Length1 Length2 Length3 Height Width
count 159.000000 159.000000 159.000000 159.000000 159.000000 159.000000 159.000000
mean 3.880503 398.326415 26.247170 28.415723 31.227044 8.970994 4.417486
std 2.026298 357.978317 9.996441 10.716328 11.610246 4.286208 1.685804
min 1.000000 0.000000 7.500000 8.400000 8.800000 1.728400 1.047600
25% 2.000000 120.000000 19.050000 21.000000 23.150000 5.944800 3.385650
50% 5.000000 273.000000 25.200000 27.300000 29.400000 7.786000 4.248500
75% 5.000000 650.000000 32.700000 35.500000 39.650000 12.365900 5.584500
max 7.000000 1650.000000 59.000000 63.400000 68.000000 18.957000 8.142000
In [24]:
#find relation
from pandas.plotting import scatter_matrix
import pandas as pd 
import matplotlib.pyplot as plt
fishing = pd.read_csv('Fish.csv')
reg = ["Species","Weight","Length1","Length2","Length3","Height","Width"]
scatter_matrix(fishing[reg],figsize=(15,15) )
plt.show()
In [2]:
import pandas as pd 
fishing=pd.read_csv('Fish.csv')
fishing.head()
corr_matrix = fishing.corr()
corr_matrix["Weight"]
Out[2]:
Weight     1.000000
Length1    0.915712
Length2    0.918618
Length3    0.923044
Height     0.724345
Width      0.886507
Name: Weight, dtype: float64
In [3]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
fishing = pd.read_csv('Fish.csv')
plt.figure(figsize=(10,6))
sns.heatmap(fishing.corr(),annot=True)
Out[3]:
<AxesSubplot:>
In [8]:
  #filter the property by types of fishes
    import seaborn as sns
    
sns.pairplot(fishing.drop("Weight", axis=1), hue="Species", size=3)
D:\ProgramData\Anaconda3\lib\site-packages\seaborn\axisgrid.py:1912: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)
Out[8]:
<seaborn.axisgrid.PairGrid at 0x260a429e4f0>
In [5]:
# use algorithms for clustering
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)  
fishing = pd.read_csv('Fish.csv')
X = fishing.drop("Species",axis=1)
y = fishing['Species']
knn=KNeighborsClassifier(n_neighbors=2,metric='minkowski',p=5)
knn.fit(X_train,y_train)
knn.score(X_train,y_train)
Out[5]:
0.7747747747747747
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [10]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage,dendrogram,fcluster
  
fishing = pd.read_csv('Fish.csv')
fishing.head()
X = fishing.drop("Species",axis=1)
y = fishing['Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

hirarachical=linkage(X,method='complete')
dendrogram(hirarachical)
plt.show()
In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd 
from sklearn.model_selection import train_test_split
fishing = pd.read_csv('Fish.csv')
fishing.head()
X = fishing.drop("Species",axis=1)
y = fishing['Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

models={
    "LogisticRegression          ":LogisticRegression(),
    "RandomForestClassifier      ":RandomForestClassifier(),
    "GradientBoostingClassifier  ":GradientBoostingClassifier(),
    "DecisionTreeClassifier      ":DecisionTreeClassifier()
}

for name,model in models.items():
    model.fit(X_train,y_train)
    
for name,model in models.items():
    print(name + ":{:.2f}%".format(model.score(X_train,y_train)*100))
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
LogisticRegression          :76.58%
RandomForestClassifier      :100.00%
GradientBoostingClassifier  :100.00%
DecisionTreeClassifier      :100.00%
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: