- Overview: An AI that predicts a sleep score from sleep patterns and habits and suggests ways to fix those habits
- Motivation: I don't get too little sleep, yet I kept feeling tired for some reason. While looking for the cause, I came across this dataset and decided to build an AI that judges sleep quality from the data and assigns it a score.
- Development period: 2024.09.03 ~
- Core features
  - Sleep score prediction
  - Sleep habit adjustment
Development process
Data collection: Kaggle dataset
Analyzing and visualizing relationships in the data
# Import modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Load the data
df = pd.read_csv('sleep_quality.csv')
# Compute the correlation matrix (numeric_only skips any non-numeric columns)
corr_matrix = df.corr(numeric_only=True)
# Visualize the correlations as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
plt.title('Correlation Matrix')
plt.show()
# Define independent and dependent variables
X = df[['Caffeine_Intake_mg', 'Stress_Level', 'Body_Temperature', 'Movement_During_Sleep',
'Sleep_Duration_Hours', 'Bedtime_Consistency', 'Light_Exposure_hours']]
y = df['Sleep_Quality_Score']
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the regression model
model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
# Print the regression results
print("Linear Regression Model Summary:")
print(model.summary())
# Set up a polynomial regression (degree 2) on caffeine intake alone
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_train[['Caffeine_Intake_mg']].values)  # .values avoids a feature-name mismatch warning below
poly_model = LinearRegression().fit(X_poly, y_train)
# Predict across the observed caffeine range
X_range = np.linspace(df['Caffeine_Intake_mg'].min(), df['Caffeine_Intake_mg'].max(), 100).reshape(-1, 1)
X_range_poly = poly.transform(X_range)
y_pred = poly_model.predict(X_range_poly)
# Visualization
plt.figure(figsize=(8, 6))
plt.scatter(df['Caffeine_Intake_mg'], df['Sleep_Quality_Score'], color='blue', label='Data')
plt.plot(X_range, y_pred, color='red', label='Polynomial Fit')
plt.xlabel('Caffeine Intake (mg)')
plt.ylabel('Sleep Quality Score')
plt.title('Polynomial Regression Fit')
plt.legend()
plt.show()
The correlation coefficients show almost no relationship with the score for anything except caffeine intake.
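To back that up, the per-feature correlations with the score can be ranked directly; a small sketch, reusing df and the column names from the script above:
# Rank each feature's correlation with the sleep score by absolute value
target_corr = df.corr(numeric_only=True)['Sleep_Quality_Score'].drop('Sleep_Quality_Score')
print(target_corr.reindex(target_corr.abs().sort_values(ascending=False).index))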
Comparing model performance by computing accuracy, residuals, error ranges, and so on
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
# Load the data
df = pd.read_csv('sleep_quality.csv')
# Define independent and dependent variables
X = df[['Caffeine_Intake_mg', 'Stress_Level', 'Body_Temperature', 'Movement_During_Sleep',
'Sleep_Duration_Hours', 'Bedtime_Consistency', 'Light_Exposure_hours']]
y = df['Sleep_Quality_Score']
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Build a linear regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)
# Predict
y_pred = model.predict(X_test_scaled)
# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")
# Polynomial features + Ridge regression
poly = PolynomialFeatures(degree=2)
poly_reg = make_pipeline(poly, Ridge(alpha=1.0))
# Train the model
poly_reg.fit(X_train_scaled, y_train)
# Predict
y_poly_pred = poly_reg.predict(X_test_scaled)
# Evaluate performance
mse_poly = mean_squared_error(y_test, y_poly_pred)
r2_poly = r2_score(y_test, y_poly_pred)
print(f"Polynomial Mean Squared Error: {mse_poly:.2f}")
print(f"Polynomial R^2 Score: {r2_poly:.2f}")
# Cross-validation (note: this pipeline has no scaler, so raw X is refit within each fold)
cv_scores = cross_val_score(poly_reg, X, y, cv=5, scoring='r2')
print(f"Cross-validated R^2 Scores: {cv_scores}")
print(f"Mean R^2 Score: {cv_scores.mean():.2f}")
# Hyperparameter tuning with grid search
param_grid = {
'polynomialfeatures__degree': [2, 3, 4],
'ridge__alpha': [0.1, 1.0, 10.0]
}
grid_search = GridSearchCV(estimator=poly_reg, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best R^2 Score:", grid_search.best_score_)
# Random forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
y_rf_pred = rf_model.predict(X_test_scaled)
# Gradient boosting model
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train_scaled, y_train)
y_gb_pred = gb_model.predict(X_test_scaled)
# Evaluate performance
print(f"Random Forest Mean Squared Error: {mean_squared_error(y_test, y_rf_pred):.2f}")
print(f"Random Forest R^2 Score: {r2_score(y_test, y_rf_pred):.2f}")
print(f"Gradient Boosting Mean Squared Error: {mean_squared_error(y_test, y_gb_pred):.2f}")
print(f"Gradient Boosting R^2 Score: {r2_score(y_test, y_gb_pred):.2f}")
# Random forest hyperparameter tuning
rf_param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10]
}
rf_grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=rf_param_grid, cv=5, scoring='r2')
rf_grid_search.fit(X_train_scaled, y_train)
print(f"Best Random Forest Parameters: {rf_grid_search.best_params_}")
# Gradient boosting hyperparameter tuning
gb_param_grid = {
'n_estimators': [50, 100, 200],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 4, 5]
}
gb_grid_search = GridSearchCV(estimator=GradientBoostingRegressor(random_state=42), param_grid=gb_param_grid, cv=5, scoring='r2')
gb_grid_search.fit(X_train_scaled, y_train)
print(f"Best Gradient Boosting Parameters: {gb_grid_search.best_params_}")
# Random forest feature importances
rf_model = rf_grid_search.best_estimator_
feature_importances = rf_model.feature_importances_
plt.figure(figsize=(12, 8))
plt.barh(X.columns, feature_importances)
plt.xlabel('Feature Importance')
plt.title('Random Forest Feature Importance')
plt.show()
Results
Mean Squared Error: 3.88
R^2 Score: 0.55
Polynomial Mean Squared Error: 0.95
Polynomial R^2 Score: 0.89
Cross-validated R^2 Scores: [0.85715614 0.86917215 0.85860747 0.8749803 0.88692695]
Mean R^2 Score: 0.87
Best Parameters: {'polynomialfeatures__degree': 3, 'ridge__alpha': 0.1}
Best R^2 Score: 0.9564619875220843
Random Forest Mean Squared Error: 0.05
Random Forest R^2 Score: 0.99
Gradient Boosting Mean Squared Error: 0.03
Gradient Boosting R^2 Score: 1.00
Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best Gradient Boosting Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
Gradient boosting turns out to be the best model (though a test R^2 of 1.00 is close enough to perfect that overfitting or leakage is worth ruling out).
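Near-perfect scores like these are easy to sanity-check visually; a minimal sketch of a predicted-vs-actual and residual plot, reusing y_test and y_gb_pred from the script above:
# Sanity-check the near-perfect fit: predicted vs. actual, and residuals
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.scatter(y_test, y_gb_pred, alpha=0.5)
ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # perfect-prediction line
ax1.set_xlabel('Actual Sleep Quality Score')
ax1.set_ylabel('Predicted Sleep Quality Score')
ax2.scatter(y_gb_pred, y_test - y_gb_pred, alpha=0.5)
ax2.axhline(0, color='red', linestyle='--')
ax2.set_xlabel('Predicted Sleep Quality Score')
ax2.set_ylabel('Residual (actual - predicted)')
plt.tight_layout()
plt.show()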
Final cleanup
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Load the data
df = pd.read_csv('sleep_quality.csv')
# Define independent and dependent variables
X = df[['Caffeine_Intake_mg', 'Stress_Level', 'Body_Temperature', 'Movement_During_Sleep',
'Sleep_Duration_Hours', 'Bedtime_Consistency', 'Light_Exposure_hours']]
y = df['Sleep_Quality_Score']
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Gradient boosting model
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train_scaled, y_train)
y_gb_pred = gb_model.predict(X_test_scaled)
# Evaluate performance
mse_gb = mean_squared_error(y_test, y_gb_pred)
r2_gb = r2_score(y_test, y_gb_pred)
print(f"Gradient Boosting Mean Squared Error: {mse_gb:.2f}")
print(f"Gradient Boosting R^2 Score: {r2_gb:.2f}")
# Gradient boosting hyperparameter tuning
gb_param_grid = {
'n_estimators': [50, 100, 200],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 4, 5]
}
gb_grid_search = GridSearchCV(estimator=GradientBoostingRegressor(random_state=42), param_grid=gb_param_grid, cv=5, scoring='r2')
gb_grid_search.fit(X_train_scaled, y_train)
print(f"Best Gradient Boosting Parameters: {gb_grid_search.best_params_}")
print(f"Best Gradient Boosting R^2 Score: {gb_grid_search.best_score_:.2f}")
# Predict again with the best gradient boosting model
best_gb_model = gb_grid_search.best_estimator_
y_best_gb_pred = best_gb_model.predict(X_test_scaled)
# Evaluate the tuned gradient boosting model
mse_best_gb = mean_squared_error(y_test, y_best_gb_pred)
r2_best_gb = r2_score(y_test, y_best_gb_pred)
print(f"Optimized Gradient Boosting Mean Squared Error: {mse_best_gb:.2f}")
print(f"Optimized Gradient Boosting R^2 Score: {r2_best_gb:.2f}")
# Visualize gradient boosting feature importances
feature_importances = best_gb_model.feature_importances_
plt.figure(figsize=(12, 8))
plt.barh(X.columns, feature_importances)
plt.xlabel('Feature Importance')
plt.title('Gradient Boosting Feature Importance')
plt.show()
Kept only the most efficient model and removed everything else.
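The habit-adjustment feature from the overview is not implemented yet; one simple starting point would be a what-if sweep with the trained model: hold one person's other habits fixed, vary a single feature, and report the value with the best predicted score. A minimal sketch, reusing pd, scaler, best_gb_model, and X_test from the final script; suggest_caffeine and the 0-400 mg search range are my own illustrative choices:
import numpy as np

# Hypothetical helper: sweep one person's caffeine intake and pick the
# predicted-score-maximizing value (0-400 mg is an assumed search range)
def suggest_caffeine(person, low=0, high=400, steps=50):
    candidates = pd.DataFrame([person] * steps)          # copy the person's other habits
    candidates['Caffeine_Intake_mg'] = np.linspace(low, high, steps)
    scores = best_gb_model.predict(scaler.transform(candidates))
    best = scores.argmax()
    return candidates['Caffeine_Intake_mg'].iloc[best], scores[best]

mg, score = suggest_caffeine(X_test.iloc[0])             # e.g. the first person in the test set
print(f"Suggested caffeine intake: {mg:.0f} mg (predicted score {score:.2f})")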
Future directions
- Analyze causal relationships between the variables
- Collect more data
  - Sleep data
  - Sleep habit data
  - Sleep duration data
- Build a UI (a model-persistence sketch toward this follows below)
  - App
- More efficient modeling
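As groundwork for the UI/app item above, the fitted scaler and tuned model can be persisted and reloaded with joblib; a minimal sketch, assuming the final script has already run (the file names are illustrative):
import joblib

# Save the fitted scaler and tuned model so an app backend can reuse them
joblib.dump(scaler, 'sleep_scaler.joblib')
joblib.dump(best_gb_model, 'sleep_gb_model.joblib')

# Later, e.g. inside the app: reload and score a new record
loaded_scaler = joblib.load('sleep_scaler.joblib')
loaded_model = joblib.load('sleep_gb_model.joblib')
print(loaded_model.predict(loaded_scaler.transform(X_test.iloc[[0]])))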
Dataset: https://www.kaggle.com/datasets/uom190346a/sleep-and-health-metrics?resource=download
GitHub: https://github.com/sinjaeu/sleep_project