# Research on fish species based on their physical measurements.
# Show the data and convert the Species column to numeric codes.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import seaborn as sns
# Load the fish measurements and swap each species name for an integer code.
# Series.map leaves NaN for any name that is missing from the dictionary.
fishing = pd.read_csv('Fish.csv')
fishing['Species'] = fishing['Species'].map({
    'Bream': 1,
    'Roach': 2,
    'Whitefish': 3,
    'Parkki': 4,
    'Perch': 5,
    'Pike': 6,
    'Smelt': 7,
})
fishing.head(10)


# Draw one 50-bin histogram per column on a 20x15 inch figure.
fishing.hist(figsize=(20, 15), bins=50)
plt.show()


# Show summary statistics for the columns (count, mean, std, min, quartiles, max).
import pandas as pd
fishing.describe()


# Look for pairwise relationships between the measurements.
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import scatter_matrix

# Reloading the CSV replaces any in-memory changes made to `fishing` earlier.
fishing = pd.read_csv('Fish.csv')

# Columns handed to the scatter matrix.
reg = [
    "Species",
    "Weight",
    "Length1",
    "Length2",
    "Length3",
    "Height",
    "Width",
]
scatter_matrix(fishing[reg], figsize=(15, 15))
plt.show()


# Correlation of every numeric measurement with Weight.
import pandas as pd

fishing = pd.read_csv('Fish.csv')

# After the reload, Species holds the species names as text. Recent pandas
# versions raise when corr() meets a non-numeric column, so select the numeric
# columns explicitly instead of relying on them being dropped silently.
corr_matrix = fishing.select_dtypes(include='number').corr()
corr_matrix["Weight"]

Weight     1.000000
Length1    0.915712
Length2    0.918618
Length3    0.923044
Height     0.724345
Width      0.886507
Name: Weight, dtype: float64


# Annotated heatmap of the correlations between the numeric measurements.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

fishing = pd.read_csv('Fish.csv')
plt.figure(figsize=(10, 6))

# Species is text after the reload; keep only numeric columns so corr() works
# on pandas versions that no longer drop non-numeric columns automatically.
numeric_corr = fishing.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

<AxesSubplot:>


  # Compare the measurements per species: pairwise plots coloured by Species,
  # with the Weight column left out.
import seaborn as sns

# `height` is the current name of the keyword formerly called `size`; the old
# name triggers a rename warning in seaborn.
sns.pairplot(fishing.drop("Weight", axis=1), hue="Species", height=3)

D:\ProgramData\Anaconda3\lib\site-packages\seaborn\axisgrid.py:1912: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

<seaborn.axisgrid.PairGrid at 0x260a429e4f0>


# Classify the species with k-nearest neighbours.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

fishing = pd.read_csv('Fish.csv')

# Features are every column except the label; the label is the species.
X = fishing.drop("Species", axis=1)
y = fishing['Species']

# The split has to come after X and y are defined, otherwise a clean run
# fails with NameError. 30% of the rows are held out; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

knn = KNeighborsClassifier(n_neighbors=2, metric='minkowski', p=5)
knn.fit(X_train, y_train)

# NOTE: this is accuracy on the training rows. Use knn.score(X_test, y_test)
# for accuracy on the held-out rows.
knn.score(X_train, y_train)

0.7747747747747747


# Hierarchical clustering of the measurements, drawn as a dendrogram.
import matplotlib.pyplot as plt
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

fishing = pd.read_csv('Fish.csv')
fishing.head()

# Separate the species label from the measurement columns.
y = fishing['Species']
X = fishing.drop("Species", axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=8
)

# Complete-linkage clustering over all rows of X (not only the training split).
hirarachical = linkage(X, method='complete')
dendrogram(hirarachical)
plt.show()


# Compare several classifiers on the species-prediction task.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.model_selection import train_test_split

fishing = pd.read_csv('Fish.csv')

# Features are every column except the label; the label is the species.
X = fishing.drop("Species", axis=1)
y = fishing['Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

# Keys are padded to a common width so the printed scores line up.
models = {
    # The default iteration cap is too low for lbfgs to converge on these
    # unscaled features (ConvergenceWarning), so allow more iterations.
    "LogisticRegression          ": LogisticRegression(max_iter=10000),
    "RandomForestClassifier      ": RandomForestClassifier(),
    "GradientBoostingClassifier  ": GradientBoostingClassifier(),
    "DecisionTreeClassifier      ": DecisionTreeClassifier(),
}

for name, model in models.items():
    model.fit(X_train, y_train)

# Report held-out accuracy next to training accuracy: the training score alone
# says nothing about generalisation when a model can memorise its training rows.
for name, model in models.items():
    train_acc = model.score(X_train, y_train) * 100
    test_acc = model.score(X_test, y_test) * 100
    print(name + ":{:.2f}% (train) {:.2f}% (test)".format(train_acc, test_acc))

D:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

LogisticRegression          :76.58%
RandomForestClassifier      :100.00%
GradientBoostingClassifier  :100.00%
DecisionTreeClassifier      :100.00%

	Species	Weight	Length1	Length2	Length3	Height	Width
0	1	242.0	23.2	25.4	30.0	11.5200	4.0200
1	1	290.0	24.0	26.3	31.2	12.4800	4.3056
2	1	340.0	23.9	26.5	31.1	12.3778	4.6961
3	1	363.0	26.3	29.0	33.5	12.7300	4.4555
4	1	430.0	26.5	29.0	34.0	12.4440	5.1340
5	1	450.0	26.8	29.7	34.7	13.6024	4.9274
6	1	500.0	26.8	29.7	34.5	14.1795	5.2785
7	1	390.0	27.6	30.0	35.0	12.6700	4.6900
8	1	450.0	27.6	30.0	35.1	14.0049	4.8438
9	1	500.0	28.5	30.7	36.2	14.2266	4.9594

	Species	Weight	Length1	Length2	Length3	Height	Width
count	159.000000	159.000000	159.000000	159.000000	159.000000	159.000000	159.000000
mean	3.880503	398.326415	26.247170	28.415723	31.227044	8.970994	4.417486
std	2.026298	357.978317	9.996441	10.716328	11.610246	4.286208	1.685804
min	1.000000	0.000000	7.500000	8.400000	8.800000	1.728400	1.047600
25%	2.000000	120.000000	19.050000	21.000000	23.150000	5.944800	3.385650
50%	5.000000	273.000000	25.200000	27.300000	29.400000	7.786000	4.248500
75%	5.000000	650.000000	32.700000	35.500000	39.650000	12.365900	5.584500
max	7.000000	1650.000000	59.000000	63.400000	68.000000	18.957000	8.142000