Linking the Datasets
Once we had concluded the initial exploration of our datasets and decided to focus on sardines, we looked for ways to relate the datasets to each other. After all, we had such a plethora of data that it was natural to try to get the most out of it and stop treating each dataset as an independent entity.
Relationship Between Sardine Larvae and Sardine
import plotly.express as px
import pandas as pd
import numpy as np
import scipy
from scipy.stats import pearsonr
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from scipy import stats
import plotly.io as pio
# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None


def _load_larvae_counts(csv_path):
    """Read a sardine CSV and expose its larvae column as a rounded ``Count``.

    The "Sardine Larvae lbs" column is renamed to "Count" and its values are
    rounded to zero decimal places; every other column is left as read.
    """
    frame = pd.read_csv(csv_path).rename(columns={"Sardine Larvae lbs": "Count"})
    frame['Count'] = frame['Count'].round(0)
    return frame


# Both files receive exactly the same preparation.
sardine_data = _load_larvae_counts("data/sardine_data.csv")
sardine_data2 = _load_larvae_counts("data/lagged_sardine_data.csv")
# Linear regression of larvae count (response) on catch weight (predictor).
# The estimator is fitted on 2-D column vectors, so each column gets a
# trailing axis before fitting.
catch_column = sardine_data['CatchLbs'].values[:, np.newaxis]
count_column = sardine_data['Count'].values[:, np.newaxis]

linear_regressor = LinearRegression().fit(catch_column, count_column)
Y_pred = linear_regressor.predict(catch_column)

# Flat 1-D copies of the same data for the correlation test below.
X = catch_column.flatten()
Y = count_column.flatten()

# Scatter plot of the raw observations; plotly draws the OLS trendline itself.
fig = px.scatter(
    sardine_data,
    x='CatchLbs',
    y='Count',
    trendline="ols",
    labels={"Count": 'Sardine Abundance'},
    title='Sardine Larvae lbs vs Sardine Catch',
)
fig.show()

# Report the Pearson correlation result exactly as scipy returns it.
correlation = pearsonr(X, Y)
print("Pearson Correlation:", correlation)