빅분기

6회 3유형_복습-chi2, OLS

dondon-a 2024. 6. 21. 23:56
반응형

3-1

A 도시 남:600, 여:550
흡|남 비율 : 0.2
흡|여 비율 : 0.26

   남녀 간 흡연자 비율에 차이가 있는지 유의수준 0.05에서 검정하라 (귀무가설 기각/채택 여부와 p-value 제시)
  • 두개의 독립된 그룹
  • 비율 검정

    카이제곱 검정

    from scipy.stats import chi2_contingency

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Chi-square test of independence on a 2x2 table (sex x smoking status).
# H0: the two variables are independent, i.e. the smoking rate is the same
# for men and women.
# chi2_contingency takes the observed counts as a 2-D array-like
# (nested list, ndarray or DataFrame).

m=600
f=550
s_when_m=0.2
s_when_f=0.26

# Observed counts = group size x conditional smoking rate.
smokingm=m*s_when_m
smokingf=f*s_when_f
nm=m*(1-s_when_m)
nf=f*(1-s_when_f)

# Rows: male, female. Columns: smoker, non-smoker.
ctgc_t=pd.DataFrame([
    [smokingm,nm],
    [smokingf,nf],
])

# For a 2x2 table (dof == 1) chi2_contingency applies Yates' continuity
# correction by default, which is what the statistic echoed below reflects.
result=chi2_contingency(ctgc_t)
chi2_stat, p_val, dof, expected = result
chi2_stat, p_val, dof, expected
# Expected counts are the counts implied by independence:
# row total x column total / grand total.
(5.521247671393307,
 0.018786854975740765,
 1,
 array([[137.2173913, 462.7826087],
        [125.7826087, 424.2173913]]))
# Inspect which attributes the returned result object exposes
# (e.g. statistic, pvalue, dof, expected_freq).
print(dir(chi2_contingency(ctgc_t)))
['__add__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__getnewargs_ex__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_asdict', '_extra_fields', '_fields', 'count', 'dof', 'expected_freq', 'index', 'pvalue', 'statistic']
# Decide at the 5% significance level instead of hard-coding the verdict:
# reject H0 ('기각') when p < 0.05, otherwise do not reject ('채택').
print('기각' if p_val < 0.05 else '채택',p_val.round(3))
기각 0.019
type(p_val)
# p_val is a NumPy scalar (numpy.float64), which is why .round() can be called on it directly.
numpy.float64

age와 Cholesterol을 설명변수로 weight를 예측하는 선형 회귀 모델을 만들고, age의 회귀 계수를 구하여라

age가 고정일 때 Cholesterol과 weight가 선형관계에 있다는 가설을 유의수준 0.05하에 검정하라

age가 55, Cholesterol가 72.6일때 위 모델을 기반으로 weight값을 예측하라.

import pandas as pd 
# Load the practice dataset (columns: age, Cholesterol, weight) and peek at the first rows.
DATA_URL = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/adp/28/p7.csv'
df = pd.read_csv(DATA_URL)
df.head()

age Cholesterol weight
0 65 69.0 111.0
1 54 117.0 81.0
2 61 86.2 72.0
3 57 76.0 78.0
4 62 160.0 61.0
# Confirm the exact column names before selecting regressors and target.
df.columns
Index(['age', 'Cholesterol', 'weight'], dtype='object')
import numpy as np
import statsmodels.api as sm
# from sklearn.linear_model import LinearRegression 
# No-intercept variant: regress weight on age and Cholesterol only.
# sm.OLS uses exactly the columns it is given, so with add_constant left
# disabled the fitted line is forced through the origin.
y=df['weight']
X=df[['age','Cholesterol']]
# X=sm.add_constant(X)   # intentionally disabled for this variant
ols=sm.OLS(y,X)
model=ols.fit()
print(model.params)
print(f'< r squared 값: {model.rsquared} >')
print(model.pvalues)
age            0.771308
Cholesterol    0.328197
dtype: float64
< r squared 값: 0.9399805187902774 >
age            8.671425e-45
Cholesterol    9.225280e-38
dtype: float64
# Attribute names of the fitted results object. Fit once and reuse the
# listing instead of fitting the same model twice.
fitted_attrs=dir(sm.OLS(y,X).fit())
s1=set(fitted_attrs)
print(fitted_attrs)
['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se', '_HCCM', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abat_diagonal', '_cache', '_data_attr', '_data_in_cache', '_get_robustcov_results', '_is_nested', '_use_t', '_wexog_singular_values', 'aic', 'bic', 'bse', 'centered_tss', 'compare_f_test', 'compare_lm_test', 'compare_lr_test', 'condition_number', 'conf_int', 'conf_int_el', 'cov_HC0', 'cov_HC1', 'cov_HC2', 'cov_HC3', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'eigenvals', 'el_test', 'ess', 'f_pvalue', 'f_test', 'fittedvalues', 'fvalue', 'get_influence', 'get_prediction', 'get_robustcov_results', 'info_criteria', 'initialize', 'k_constant', 'llf', 'load', 'model', 'mse_model', 'mse_resid', 'mse_total', 'nobs', 'normalized_cov_params', 'outlier_test', 'params', 'predict', 'pvalues', 'remove_data', 'resid', 'resid_pearson', 'rsquared', 'rsquared_adj', 'save', 'scale', 'ssr', 'summary', 'summary2', 't_test', 't_test_pairwise', 'tvalues', 'uncentered_tss', 'use_t', 'wald_test', 'wald_test_terms', 'wresid']
# Attribute names of the unfitted model object, i.e. sm.OLS(y, X) before
# .fit() is called. Build it once and reuse the listing.
unfitted_attrs=dir(sm.OLS(y,X))
s2=set(unfitted_attrs)
print(unfitted_attrs)
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_check_kwargs', '_data_attr', '_df_model', '_df_resid', '_fit_collinear', '_fit_ridge', '_fit_zeros', '_formula_max_endog', '_get_init_kwds', '_handle_data', '_init_keys', '_kwargs_allowed', '_setup_score_hess', '_sqrt_lasso', 'data', 'df_model', 'df_resid', 'endog', 'endog_names', 'exog', 'exog_names', 'fit', 'fit_regularized', 'from_formula', 'get_distribution', 'hessian', 'hessian_factor', 'information', 'initialize', 'k_constant', 'loglike', 'nobs', 'predict', 'rank', 'score', 'weights', 'wendog', 'wexog', 'whiten']
# Attributes that exist on the fitted results object but not on the unfitted model.
print(s1-s2)
{'HC1_se', '_wexog_singular_values', 'cov_params', 'resid_pearson', 'outlier_test', 'condition_number', 'resid', 'rsquared_adj', 'f_test', '_HCCM', 'save', 'aic', 'HC2_se', 'rsquared', 'wald_test', 'get_influence', 'pvalues', 'cov_HC2', 'ssr', 'normalized_cov_params', 'model', 'fittedvalues', 'fvalue', 'eigenvals', '_data_in_cache', 'scale', 'tvalues', 'cov_HC1', 'wresid', 'conf_int_el', 'info_criteria', 'cov_HC3', 'llf', 'get_prediction', 'bse', 'conf_int', 'mse_model', '_get_robustcov_results', 'ess', 'use_t', '_abat_diagonal', 'centered_tss', 'summary', 'mse_resid', '_use_t', 'f_pvalue', 'HC0_se', 'get_robustcov_results', 'cov_kwds', 'cov_HC0', 'uncentered_tss', 'compare_f_test', 'compare_lr_test', 'el_test', '_is_nested', 'compare_lm_test', 'summary2', '_cache', 'HC3_se', 'load', 'mse_total', 't_test_pairwise', 't_test', 'params', 'bic', 'wald_test_terms', 'remove_data', 'cov_type'}
# Variant with an intercept: sm.add_constant prepends a 'const' column to X.
y=df['weight']
X=sm.add_constant(df[['age','Cholesterol']])
model=sm.OLS(y,X).fit()  # build and fit in one step; the fitted results are what gets queried
print(model.params)
print(f'< r squared 값: {model.rsquared} >')
print(model.pvalues)

# NOTE: R-squared is not a valid criterion for deciding whether add_constant
# is needed. For a model without a constant statsmodels reports the
# *uncentered* R-squared (variation measured around 0 instead of around the
# mean of y), so it is not comparable with the centered R-squared printed
# here. Keep the intercept unless there is a substantive reason to force the
# fit through the origin.
const          74.895281
age            -0.036102
Cholesterol     0.081929
dtype: float64
< r squared 값: 0.04388934075014217 >
const          2.951249e-46
age            5.419233e-01
Cholesterol    2.377808e-04
dtype: float64
# Full regression table for the current `model`.
# NOTE(review): the table pasted below reports "R-squared (uncentered)" and has
# no const row, so it appears to come from the no-constant fit rather than the
# add_constant fit; re-run the cells in order to confirm.
model.summary()
OLS Regression Results
Dep. Variable: weight R-squared (uncentered): 0.940
Model: OLS Adj. R-squared (uncentered): 0.940
Method: Least Squares F-statistic: 2592.
Date: Fri, 21 Jun 2024 Prob (F-statistic): 6.42e-203
Time: 22:16:17 Log-Likelihood: -1477.9
No. Observations: 333 AIC: 2960.
Df Residuals: 331 BIC: 2967.
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
age 0.7713 0.047 16.433 0.000 0.679 0.864
Cholesterol 0.3282 0.022 14.638 0.000 0.284 0.372
Omnibus: 1.639 Durbin-Watson: 1.951
Prob(Omnibus): 0.441 Jarque-Bera (JB): 1.378
Skew: 0.100 Prob(JB): 0.502
Kurtosis: 3.244 Cond. No. 5.93


Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# List the attributes and methods available on the fitted results object.
print(dir(model))
['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se', '_HCCM', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abat_diagonal', '_cache', '_data_attr', '_data_in_cache', '_get_robustcov_results', '_is_nested', '_use_t', '_wexog_singular_values', 'aic', 'bic', 'bse', 'centered_tss', 'compare_f_test', 'compare_lm_test', 'compare_lr_test', 'condition_number', 'conf_int', 'conf_int_el', 'cov_HC0', 'cov_HC1', 'cov_HC2', 'cov_HC3', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'eigenvals', 'el_test', 'ess', 'f_pvalue', 'f_test', 'fittedvalues', 'fvalue', 'get_influence', 'get_prediction', 'get_robustcov_results', 'info_criteria', 'initialize', 'k_constant', 'llf', 'load', 'model', 'mse_model', 'mse_resid', 'mse_total', 'nobs', 'normalized_cov_params', 'outlier_test', 'params', 'predict', 'pvalues', 'remove_data', 'resid', 'resid_pearson', 'rsquared', 'rsquared_adj', 'save', 'scale', 'ssr', 'summary', 'summary2', 't_test', 't_test_pairwise', 'tvalues', 'uncentered_tss', 'use_t', 'wald_test', 'wald_test_terms', 'wresid']
# Look up a single p-value by regressor name (the original note suggests
# positional access also works; not verified here).
# NOTE(review): the value echoed below (9.2e-38) equals the Cholesterol p-value
# printed for the no-constant fit; the add_constant fit printed 2.377808e-04.
# Confirm which `model` was live when this cell ran.
model.pvalues['Cholesterol']
9.22527952319271e-38
# Predict weight for age=55, Cholesterol=72.6. Values are passed in the same
# order as the columns of X; the leading 1 is the value of the const column,
# so this call goes with the add_constant fit.
pred=model.predict([1,55,72.6])
pred
array([78.85771011])
# predict returned a one-element array; take the scalar prediction.
pred[0]
78.85771011344595
# R-squared of the current fit (the value echoed below matches the add_constant fit).
print(model.rsquared)
0.04388934075014217

정리: from scipy.stats import chi2_contingency
import statsmodels.api as sm  →  sm.OLS, sm.Logit

반응형