3-1

A 도시 남:600, 여:550
남성 중 흡연자 비율 P(흡연|남) : 0.2
여성 중 흡연자 비율 P(흡연|여) : 0.26

남녀 간 흡연자 비율에 차이가 있는지 유의수준 0.05에서 검정하라. 귀무가설의 기각/채택 여부와 p-value를 제시하라.

풀이 방향:
- 두 개의 독립된 그룹(남/여)
- 두 그룹 간 비율 검정
- 2x2 분할표에 대한 카이제곱 검정

from scipy.stats import chi2_contingency

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Chi-square test of independence on a 2x2 table (sex x smoking status).
# H0: the two variables are independent, i.e. the smoking rate is the same
# for men and women.

# Group sizes and the smoking rate within each group.
m = 600
f = 550
s_when_m = 0.2
s_when_f = 0.26

# Convert the rates into observed cell counts (smokers, non-smokers).
smokingm, nm = m * s_when_m, m * (1 - s_when_m)
smokingf, nf = f * s_when_f, f * (1 - s_when_f)

# Observed frequency table:
#            smoker     non-smoker
#   male     smokingm   nm
#   female   smokingf   nf
# chi2_contingency takes any 2-D array-like, so a nested list wrapped in a
# DataFrame is accepted as-is.
ctgc_t = pd.DataFrame([[smokingm, nm], [smokingf, nf]])

# With dof == 1 (a 2x2 table) chi2_contingency applies Yates' continuity
# correction by default, so the statistic is the corrected one.
chi2_stat, p_val, dof, expected = chi2_contingency(ctgc_t)
chi2_stat, p_val, dof, expected
# The expected frequencies are the counts implied by independence (smoking
# status unrelated to sex): row total * column total / grand total.

(5.521247671393307,
 0.018786854975740765,
 1,
 array([[137.2173913, 462.7826087],
        [125.7826087, 424.2173913]]))

# List the attributes of the object returned by chi2_contingency; the printed
# listing includes the result fields statistic, pvalue, dof and expected_freq.
print(dir(chi2_contingency(ctgc_t)))  # original note: inspecting an "empty" (not yet evaluated) model shows nothing useful

['__add__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__getnewargs_ex__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_asdict', '_extra_fields', '_fields', 'count', 'dof', 'expected_freq', 'index', 'pvalue', 'statistic']

# Derive the verdict from the p-value at the 5% significance level instead of
# hard-coding it: '기각' (reject H0) when p < alpha, otherwise '채택'
# (do not reject H0). For the data above this prints the same line as before.
alpha = 0.05
decision = '기각' if p_val < alpha else '채택'
print(decision, p_val.round(3))

기각 0.019

# Show the runtime type of the p-value.
type(p_val)
# It is a NumPy scalar (numpy.float64), which is why .round() can be called
# on it directly.

numpy.float64

age와 Cholesterol로 weight를 예측하는 선형 회귀 모델을 만들고, age의 회귀 계수를 구하여라.

age가 고정일 때 Cholesterol과 weight가 선형 관계에 있다는 가설을 유의수준 0.05하에서 검정하라.

age가 55, Cholesterol이 72.6일 때 위 모델을 기반으로 weight 값을 예측하라.

import pandas as pd 
# Load the regression dataset (columns: age, Cholesterol, weight) straight
# from the remote CSV and preview the first rows.
DATA_URL = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/adp/28/p7.csv'
df = pd.read_csv(DATA_URL)
df.head()

	age	Cholesterol	weight
0	65	69.0	111.0
1	54	117.0	81.0
2	61	86.2	72.0
3	57	76.0	78.0
4	62	160.0	61.0

# Display the column labels to confirm the exact spelling used for indexing.
df.columns

Index(['age', 'Cholesterol', 'weight'], dtype='object')

import numpy as np
import statsmodels.api as sm
# from sklearn.linear_model import LinearRegression

# Fit weight ~ age + Cholesterol WITHOUT an intercept: no constant column is
# added to X here, so the regression line is forced through the origin.
# For such a fit statsmodels reports the *uncentered* R-squared, which is not
# comparable to the usual (centered) R-squared of a model with a constant.
X = df[['age', 'Cholesterol']]
y = df['weight']
model = sm.OLS(y, X).fit()

# Coefficients, R-squared and per-coefficient p-values, one item per line.
print(model.params, f'< r squared 값: {model.rsquared} >', model.pvalues, sep='\n')

age            0.771308
Cholesterol    0.328197
dtype: float64
< r squared 값: 0.9399805187902774 >
age            8.671425e-45
Cholesterol    9.225280e-38
dtype: float64

# Attribute names of a *fitted* results object. The model is fitted once and
# the listing reused, rather than fitting twice just to call dir() twice.
_fitted_attrs = dir(sm.OLS(y, X).fit())
s1 = set(_fitted_attrs)
print(_fitted_attrs)

['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se', '_HCCM', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abat_diagonal', '_cache', '_data_attr', '_data_in_cache', '_get_robustcov_results', '_is_nested', '_use_t', '_wexog_singular_values', 'aic', 'bic', 'bse', 'centered_tss', 'compare_f_test', 'compare_lm_test', 'compare_lr_test', 'condition_number', 'conf_int', 'conf_int_el', 'cov_HC0', 'cov_HC1', 'cov_HC2', 'cov_HC3', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'eigenvals', 'el_test', 'ess', 'f_pvalue', 'f_test', 'fittedvalues', 'fvalue', 'get_influence', 'get_prediction', 'get_robustcov_results', 'info_criteria', 'initialize', 'k_constant', 'llf', 'load', 'model', 'mse_model', 'mse_resid', 'mse_total', 'nobs', 'normalized_cov_params', 'outlier_test', 'params', 'predict', 'pvalues', 'remove_data', 'resid', 'resid_pearson', 'rsquared', 'rsquared_adj', 'save', 'scale', 'ssr', 'summary', 'summary2', 't_test', 't_test_pairwise', 'tvalues', 'uncentered_tss', 'use_t', 'wald_test', 'wald_test_terms', 'wresid']

# Attribute names of the *unfitted* model object: before .fit() it carries no
# estimation results. The model is constructed once and the listing reused.
_unfitted_attrs = dir(sm.OLS(y, X))
s2 = set(_unfitted_attrs)
print(_unfitted_attrs)

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_check_kwargs', '_data_attr', '_df_model', '_df_resid', '_fit_collinear', '_fit_ridge', '_fit_zeros', '_formula_max_endog', '_get_init_kwds', '_handle_data', '_init_keys', '_kwargs_allowed', '_setup_score_hess', '_sqrt_lasso', 'data', 'df_model', 'df_resid', 'endog', 'endog_names', 'exog', 'exog_names', 'fit', 'fit_regularized', 'from_formula', 'get_distribution', 'hessian', 'hessian_factor', 'information', 'initialize', 'k_constant', 'loglike', 'nobs', 'predict', 'rank', 'score', 'weights', 'wendog', 'wexog', 'whiten']

# Names present on the fitted results object but absent from the unfitted
# model, i.e. what .fit() adds (params, pvalues, rsquared, summary, ...).
print(s1.difference(s2))

{'HC1_se', '_wexog_singular_values', 'cov_params', 'resid_pearson', 'outlier_test', 'condition_number', 'resid', 'rsquared_adj', 'f_test', '_HCCM', 'save', 'aic', 'HC2_se', 'rsquared', 'wald_test', 'get_influence', 'pvalues', 'cov_HC2', 'ssr', 'normalized_cov_params', 'model', 'fittedvalues', 'fvalue', 'eigenvals', '_data_in_cache', 'scale', 'tvalues', 'cov_HC1', 'wresid', 'conf_int_el', 'info_criteria', 'cov_HC3', 'llf', 'get_prediction', 'bse', 'conf_int', 'mse_model', '_get_robustcov_results', 'ess', 'use_t', '_abat_diagonal', 'centered_tss', 'summary', 'mse_resid', '_use_t', 'f_pvalue', 'HC0_se', 'get_robustcov_results', 'cov_kwds', 'cov_HC0', 'uncentered_tss', 'compare_f_test', 'compare_lr_test', 'el_test', '_is_nested', 'compare_lm_test', 'summary2', '_cache', 'HC3_se', 'load', 'mse_total', 't_test_pairwise', 't_test', 'params', 'bic', 'wald_test_terms', 'remove_data', 'cov_type'}

# Variant WITH an intercept: sm.add_constant adds a 'const' column so the
# model estimates an intercept alongside the age and Cholesterol slopes.
X = sm.add_constant(df[['age', 'Cholesterol']])
y = df['weight']
model = sm.OLS(y, X).fit()  # construct and fit in a single expression

print(model.params)
print(f'< r squared 값: {model.rsquared} >')
print(model.pvalues)

# NOTE: do not decide whether a constant is needed by comparing R-squared
# between the two fits. Without a constant statsmodels reports the uncentered
# R-squared (see note [1] in the summary output), which is computed on a
# different baseline and is not comparable to the centered R-squared printed
# here. A low R-squared for this fit means age and Cholesterol explain little
# of the variation in weight, not that the intercept should be dropped.

const          74.895281
age            -0.036102
Cholesterol     0.081929
dtype: float64
< r squared 값: 0.04388934075014217 >
const          2.951249e-46
age            5.419233e-01
Cholesterol    2.377808e-04
dtype: float64

# Render the full regression report for the currently bound `model`.
# NOTE(review): the output captured below reports an uncentered R-squared and
# has no 'const' row, so it appears to come from the no-intercept fit rather
# than the add_constant fit directly above -- re-run the cells in order to
# confirm.
model.summary()

OLS Regression Results
Dep. Variable:	weight	R-squared (uncentered):	0.940
Model:	OLS	Adj. R-squared (uncentered):	0.940
Method:	Least Squares	F-statistic:	2592.
Date:	Fri, 21 Jun 2024	Prob (F-statistic):	6.42e-203
Time:	22:16:17	Log-Likelihood:	-1477.9
No. Observations:	333	AIC:	2960.
Df Residuals:	331	BIC:	2967.
Df Model:	2
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
age	0.7713	0.047	16.433	0.000	0.679	0.864
Cholesterol	0.3282	0.022	14.638	0.000	0.284	0.372

Omnibus:	1.639	Durbin-Watson:	1.951
Prob(Omnibus):	0.441	Jarque-Bera (JB):	1.378
Skew:	0.100	Prob(JB):	0.502
Kurtosis:	3.244	Cond. No.	5.93

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# List every attribute available on the fitted results object.
print(dir(model))

['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se', '_HCCM', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abat_diagonal', '_cache', '_data_attr', '_data_in_cache', '_get_robustcov_results', '_is_nested', '_use_t', '_wexog_singular_values', 'aic', 'bic', 'bse', 'centered_tss', 'compare_f_test', 'compare_lm_test', 'compare_lr_test', 'condition_number', 'conf_int', 'conf_int_el', 'cov_HC0', 'cov_HC1', 'cov_HC2', 'cov_HC3', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'eigenvals', 'el_test', 'ess', 'f_pvalue', 'f_test', 'fittedvalues', 'fvalue', 'get_influence', 'get_prediction', 'get_robustcov_results', 'info_criteria', 'initialize', 'k_constant', 'llf', 'load', 'model', 'mse_model', 'mse_resid', 'mse_total', 'nobs', 'normalized_cov_params', 'outlier_test', 'params', 'predict', 'pvalues', 'remove_data', 'resid', 'resid_pearson', 'rsquared', 'rsquared_adj', 'save', 'scale', 'ssr', 'summary', 'summary2', 't_test', 't_test_pairwise', 'tvalues', 'uncentered_tss', 'use_t', 'wald_test', 'wald_test_terms', 'wresid']

# p-value of the Cholesterol coefficient; comparing it with alpha = 0.05
# answers whether Cholesterol is linearly related to weight with age held fixed.
# NOTE(review): the value captured below (9.2e-38) equals the p-value printed
# by the no-intercept fit; the add_constant fit printed 2.377808e-04. Re-run to
# confirm which model this cell reflects.
model.pvalues['Cholesterol']  # label-based lookup; the original note says positional access seems to work too (unverified)

9.22527952319271e-38

# One new observation, given in the same order as the columns of X after
# add_constant: [const, age, Cholesterol]. The leading 1 is the intercept term.
new_observation = [1, 55, 72.6]
pred = model.predict(new_observation)
pred

array([78.85771011])

# predict() returned a one-element array; take its single value.
pred[0]

78.85771011344595

# R-squared of the currently bound `model`; the captured value equals the one
# printed by the add_constant fit.
print(model.rsquared)

0.04388934075014217

# 정리 (이 노트에서 사용한 import):
# from scipy.stats import chi2_contingency
# import statsmodels.api as sm   # sm.OLS, sm.Logit