빅분기
6회 3유형_복습-chi2, OLS
dondon-a
2024. 6. 21. 23:56
반응형
3-1
A 도시 남:600, 여:550
흡|남 비율 : 0.2
흡|여 비율 : 0.26
남녀 간 흡연 비율(흡연 여부에 따른 인구비)이 다른지 유의수준 0.05에서 검정하고, 귀무가설 기각/채택 여부와 p-value를 구하여라.
- 두개의 독립된 그룹
- 비율 검정
카이제곱 검정
from scipy.stats import chi2_contingency
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
# Chi-square test of independence on a 2x2 table of observed counts.
# H0: sex and smoking status are independent, i.e. the smoking rate is the
# same for men and women.
# chi2_contingency takes the table as a 2-D array-like (here a list of rows
# wrapped in a DataFrame).
m, f = 600, 550                  # group sizes: men, women
s_when_m, s_when_f = 0.2, 0.26   # smoking rate given male / given female

# Observed counts, one (smoker, non-smoker) pair per sex.
smokingm, nm = m*s_when_m, m*(1-s_when_m)
smokingf, nf = f*s_when_f, f*(1-s_when_f)

# Rows: male, female.  Columns: smoker, non-smoker.
ctgc_t = pd.DataFrame([[smokingm, nm],
                       [smokingf, nf]])

# NOTE: for a 2x2 table (dof == 1) scipy applies Yates' continuity
# correction by default, so the statistic is the corrected one.
chi2_stat, p_val, dof, expected = chi2_contingency(ctgc_t)
chi2_stat, p_val, dof, expected
# The expected counts are the cell values implied by independence:
# row total * column total / grand total (e.g. 600 * 263 / 1150 for male smokers).
(5.521247671393307,
0.018786854975740765,
1,
array([[137.2173913, 462.7826087],
[125.7826087, 424.2173913]]))
# List the attributes of the result object returned by chi2_contingency.
# (Original note: inspecting an "empty" object, i.e. one not built from data, shows nothing.)
print(dir(chi2_contingency(ctgc_t)))
['__add__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__getnewargs_ex__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_asdict', '_extra_fields', '_fields', 'count', 'dof', 'expected_freq', 'index', 'pvalue', 'statistic']
# Report the test decision together with the rounded p-value.
# The verdict is derived from the p-value at the 0.05 significance level
# instead of being printed unconditionally: '기각' (reject H0) when
# p < alpha, otherwise '채택' (fail to reject H0).
alpha = 0.05
print('기각' if p_val < alpha else '채택', p_val.round(3))
기각 0.019
type(p_val)
# p_val is a NumPy scalar (numpy.float64), which is why .round() can be called on it directly.
numpy.float64
age와 Cholesterol로 weight를 예측하는 선형 회귀 모델을 만들고, age의 회귀 계수를 구하여라.
age가 고정일 때 Cholesterol과 weight가 선형관계에 있다는 가설을 유의수준 0.05하에 검정하라.
age가 55, Cholesterol가 72.6일때 위 모델을 기반으로 weight값을 예측하라.
import pandas as pd
# Download the exercise data set and show its first rows.
DATA_URL = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/adp/28/p7.csv'
df = pd.read_csv(DATA_URL)
df.head()
age | Cholesterol | weight | |
---|---|---|---|
0 | 65 | 69.0 | 111.0 |
1 | 54 | 117.0 | 81.0 |
2 | 61 | 86.2 | 72.0 |
3 | 57 | 76.0 | 78.0 |
4 | 62 | 160.0 | 61.0 |
# Show the column labels of the loaded frame.
df.columns
Index(['age', 'Cholesterol', 'weight'], dtype='object')
import numpy as np
import statsmodels.api as sm
# from sklearn.linear_model import LinearRegression
# Fit weight ~ age + Cholesterol by OLS WITHOUT an intercept
# (sm.add_constant is deliberately not applied to X here), then print the
# coefficients, R-squared and per-coefficient p-values.
y = df['weight']
X = df[['age', 'Cholesterol']]
model = sm.OLS(y, X).fit()
for report in (model.params, f'< r squared 값: {model.rsquared} >', model.pvalues):
    print(report)
age 0.771308
Cholesterol 0.328197
dtype: float64
< r squared 값: 0.9399805187902774 >
age 8.671425e-45
Cholesterol 9.225280e-38
dtype: float64
# Attribute names available on a fitted OLS results object.
# Fit once and reuse the listing for both the set and the printout; the
# model was previously fitted twice only to call dir() on each result.
fitted_attrs = dir(sm.OLS(y,X).fit())
s1 = set(fitted_attrs)
print(fitted_attrs)
['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se', '_HCCM', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abat_diagonal', '_cache', '_data_attr', '_data_in_cache', '_get_robustcov_results', '_is_nested', '_use_t', '_wexog_singular_values', 'aic', 'bic', 'bse', 'centered_tss', 'compare_f_test', 'compare_lm_test', 'compare_lr_test', 'condition_number', 'conf_int', 'conf_int_el', 'cov_HC0', 'cov_HC1', 'cov_HC2', 'cov_HC3', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'eigenvals', 'el_test', 'ess', 'f_pvalue', 'f_test', 'fittedvalues', 'fvalue', 'get_influence', 'get_prediction', 'get_robustcov_results', 'info_criteria', 'initialize', 'k_constant', 'llf', 'load', 'model', 'mse_model', 'mse_resid', 'mse_total', 'nobs', 'normalized_cov_params', 'outlier_test', 'params', 'predict', 'pvalues', 'remove_data', 'resid', 'resid_pearson', 'rsquared', 'rsquared_adj', 'save', 'scale', 'ssr', 'summary', 'summary2', 't_test', 't_test_pairwise', 'tvalues', 'uncentered_tss', 'use_t', 'wald_test', 'wald_test_terms', 'wresid']
# Attribute names available on the OLS model object before .fit() is called
# (effectively an "empty" model: no estimation results yet).
# Build the model once and reuse the listing instead of constructing it twice.
unfitted_attrs = dir(sm.OLS(y,X))
s2 = set(unfitted_attrs)
print(unfitted_attrs)
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_check_kwargs', '_data_attr', '_df_model', '_df_resid', '_fit_collinear', '_fit_ridge', '_fit_zeros', '_formula_max_endog', '_get_init_kwds', '_handle_data', '_init_keys', '_kwargs_allowed', '_setup_score_hess', '_sqrt_lasso', 'data', 'df_model', 'df_resid', 'endog', 'endog_names', 'exog', 'exog_names', 'fit', 'fit_regularized', 'from_formula', 'get_distribution', 'hessian', 'hessian_factor', 'information', 'initialize', 'k_constant', 'loglike', 'nobs', 'predict', 'rank', 'score', 'weights', 'wendog', 'wexog', 'whiten']
# Names present on the fitted results object but absent from the unfitted model.
print(s1.difference(s2))
{'HC1_se', '_wexog_singular_values', 'cov_params', 'resid_pearson', 'outlier_test', 'condition_number', 'resid', 'rsquared_adj', 'f_test', '_HCCM', 'save', 'aic', 'HC2_se', 'rsquared', 'wald_test', 'get_influence', 'pvalues', 'cov_HC2', 'ssr', 'normalized_cov_params', 'model', 'fittedvalues', 'fvalue', 'eigenvals', '_data_in_cache', 'scale', 'tvalues', 'cov_HC1', 'wresid', 'conf_int_el', 'info_criteria', 'cov_HC3', 'llf', 'get_prediction', 'bse', 'conf_int', 'mse_model', '_get_robustcov_results', 'ess', 'use_t', '_abat_diagonal', 'centered_tss', 'summary', 'mse_resid', '_use_t', 'f_pvalue', 'HC0_se', 'get_robustcov_results', 'cov_kwds', 'cov_HC0', 'uncentered_tss', 'compare_f_test', 'compare_lr_test', 'el_test', '_is_nested', 'compare_lm_test', 'summary2', '_cache', 'HC3_se', 'load', 'mse_total', 't_test_pairwise', 't_test', 'params', 'bic', 'wald_test_terms', 'remove_data', 'cov_type'}
# Same regression, this time with an intercept column added by sm.add_constant.
y = df['weight']
X = sm.add_constant(df[['age', 'Cholesterol']])
model = sm.OLS(y, X).fit()  # build and fit in a single expression
for report in (model.params, f'< r squared 값: {model.rsquared} >', model.pvalues):
    print(report)
# NOTE(review): do not use R-squared to decide whether the constant is needed.
# The no-intercept fit reports an *uncentered* R-squared (statsmodels says so
# in its summary notes), which is not comparable with the centered R-squared
# printed here, so the larger value there does not by itself show that
# dropping the constant gives a better model.
const 74.895281
age -0.036102
Cholesterol 0.081929
dtype: float64
< r squared 값: 0.04388934075014217 >
const 2.951249e-46
age 5.419233e-01
Cholesterol 2.377808e-04
dtype: float64
# Full regression report for the current `model`.
# NOTE(review): the table displayed after this call is labelled
# "R-squared (uncentered)" and has no const row, so it appears to have been
# produced by the no-intercept fit rather than the add_constant fit; re-run to confirm.
model.summary()
Dep. Variable: | weight | R-squared (uncentered): | 0.940 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.940 |
Method: | Least Squares | F-statistic: | 2592. |
Date: | Fri, 21 Jun 2024 | Prob (F-statistic): | 6.42e-203 |
Time: | 22:16:17 | Log-Likelihood: | -1477.9 |
No. Observations: | 333 | AIC: | 2960. |
Df Residuals: | 331 | BIC: | 2967. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
age | 0.7713 | 0.047 | 16.433 | 0.000 | 0.679 | 0.864 |
Cholesterol | 0.3282 | 0.022 | 14.638 | 0.000 | 0.284 | 0.372 |
Omnibus: | 1.639 | Durbin-Watson: | 1.951 |
---|---|---|---|
Prob(Omnibus): | 0.441 | Jarque-Bera (JB): | 1.378 |
Skew: | 0.100 | Prob(JB): | 0.502 |
Kurtosis: | 3.244 | Cond. No. | 5.93 |
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# List every attribute and method exposed by the fitted results object.
print(dir(model))
['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se', '_HCCM', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abat_diagonal', '_cache', '_data_attr', '_data_in_cache', '_get_robustcov_results', '_is_nested', '_use_t', '_wexog_singular_values', 'aic', 'bic', 'bse', 'centered_tss', 'compare_f_test', 'compare_lm_test', 'compare_lr_test', 'condition_number', 'conf_int', 'conf_int_el', 'cov_HC0', 'cov_HC1', 'cov_HC2', 'cov_HC3', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'eigenvals', 'el_test', 'ess', 'f_pvalue', 'f_test', 'fittedvalues', 'fvalue', 'get_influence', 'get_prediction', 'get_robustcov_results', 'info_criteria', 'initialize', 'k_constant', 'llf', 'load', 'model', 'mse_model', 'mse_resid', 'mse_total', 'nobs', 'normalized_cov_params', 'outlier_test', 'params', 'predict', 'pvalues', 'remove_data', 'resid', 'resid_pearson', 'rsquared', 'rsquared_adj', 'save', 'scale', 'ssr', 'summary', 'summary2', 't_test', 't_test_pairwise', 'tvalues', 'uncentered_tss', 'use_t', 'wald_test', 'wald_test_terms', 'wresid']
# Look up the Cholesterol p-value by index label.
# (Original note: positional indexing seemed to work as well; presumably .iloc is the safer choice for that, verify.)
# NOTE(review): the value displayed after this line equals the p-value printed for the no-intercept fit.
model.pvalues['Cholesterol']
9.22527952319271e-38
# Predict weight for age=55, Cholesterol=72.6.
# Values must be given in the same order as the columns of X; the leading 1
# fills the constant column added by sm.add_constant.
new_obs = [1, 55, 72.6]
pred = model.predict(new_obs)
pred
array([78.85771011])
# Extract the single predicted value from the one-element prediction array.
pred[0]
78.85771011344595
# R-squared of the current `model`.
print(model.rsquared)
0.04388934075014217
정리: from scipy.stats import chi2_contingency / import statsmodels.api as sm → sm.OLS, sm.Logit
반응형