CausalInference Tutorial
In this tutorial we're going to cover techniques such as Inverse Propensity Score Weighting, Matching, the Doubly Robust Weighted Estimator, Trimming, and Stratification.
We want to measure the effect of smoking cessation on weight gain using the National Health and Nutrition Examination Survey Epidemiologic Followup Study (NHEFS) dataset. The study ran for 11 years (1971-1982). Each subject was weighed twice: first at the beginning of the study in 1971 and again at the end, in 1982.
import pandas as pd
from causalinference import CausalModel
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
data = pd.read_csv("https://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/1268/20/nhefs.csv")
data.columns
data.shape
data.info()
data.head()
data.describe()
data.isnull().sum()
We're going to remove people who did not survive until the end of the study in 1982. We can check this by testing whether the field 'wt82' (the weight measured in 1982) has a value.
data = data.dropna(subset=['wt82'])
data.shape
From https://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/1268/20/hernanrobins_v2.17.21.pdf
Our goal is to estimate the average causal effect of smoking cessation (the treatment) A on weight gain (the outcome) Y. To do so, we will use data from 1566 cigarette smokers aged 25-74 years who, as part of the NHEFS, had a baseline visit and a follow-up visit about 10 years later.
- The treatment variable (A) is qsmk (quit smoking). Individuals were classified as treated (A=1) if they reported having quit smoking before the follow-up visit, and as untreated (A=0) otherwise.
data['qsmk'].value_counts(ascending=True)
data.qsmk.value_counts(normalize=True)
A = data['qsmk']
- The outcome variable (Y) is wt82_71 (the individual's body weight at the follow-up visit in 1982 minus the individual's body weight at the baseline visit in 1971).
data['wt82_71'].value_counts()
Y = data['wt82_71']
- The confounding variables (X) (variables that influence both the treatment and the outcome, causing spurious correlations) will be:
cf = ["active", # Measure of daily activity in 1971.
      "age", # Age in 1971.
      "education", # Amount of education in 1971: from 8th grade to college level.
      "exercise", # Measure of recreational exercise.
      "race", # Caucasian or not.
      "sex", # Female or male.
      "smokeintensity", # Number of cigarettes smoked per day in 1971.
      "smokeyrs", # Years of smoking.
      "wt71" # Weight in kilograms in 1971.
     ]
X = data[cf]
print(X.shape)
X.head()
X['active'].value_counts()
X['exercise'].value_counts()
X['education'].value_counts()
- Distribution of the outcome in relation to the control and treatment groups
data[data.qsmk==0].wt82_71.count()
data[data.qsmk==1].wt82_71.count()
We have a very imbalanced dataset, with 1163 samples in the control group and 403 in the treatment group.
sns.kdeplot(data[data.qsmk==0].wt82_71, label='control')
sns.kdeplot(data[data.qsmk==1].wt82_71, label='treatment')
plt.legend()
sns.histplot(data[data.qsmk==1]['wt82_71'], bins=20)
sns.histplot(data[data.qsmk==0]['wt82_71'], bins=20)
data.groupby('qsmk')['wt82_71'].plot(kind='hist', bins=20, alpha=0.8, legend=True)
data.groupby('qsmk')['wt82_71'].agg(['mean', 'median'])
The mean weight gain for the treatment group is greater than for the control group. Could we conclude that the treatment had a positive impact (more weight gain for subjects who quit smoking)? We have to remember that the dataset is imbalanced and that these are unadjusted means.
- Distribution of several covariates in relation to the control and treatment groups
data.groupby('qsmk')['age'].plot(kind='hist', bins=20, alpha=0.8, legend=True)
data.groupby('qsmk')['education'].plot(kind='hist', bins=20, alpha=0.8, legend=True)
data.groupby('qsmk')['active'].plot(kind='hist', bins=20, alpha=0.5, legend=True)
data.groupby('qsmk')['exercise'].plot(kind='hist', bins=20, alpha=0.5, legend=True)
Under the consistency assumption, the observed outcome equals the potential outcome for the treatment actually received:
$$Y = Y(0) \text{ when } A = 0, \qquad Y = Y(1) \text{ when } A = 1$$
We want to estimate the ATE. The real ATE, as measured in a Randomized Controlled Trial (interventional distribution), is defined as:
$$\Delta_{real} = E[Y(1) - Y(0)]$$
But if we compute the analogous quantity from observational data:
$$\Delta_{obs} = E[Y|A=1] - E[Y|A=0] = E[Y(1)|A=1] - E[Y(0)|A=0] \neq \Delta_{real}$$
because, in general,
$$E[Y(i)|A=i] \neq E[Y(i)]$$
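We can compute this naive Δobs directly from the data; it is just the unadjusted difference in group means, not the causal ATE:
data[data.qsmk==1].wt82_71.mean() - data[data.qsmk==0].wt82_71.mean()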
Ignorability assumption: conditional on the covariates X, the potential outcomes are independent of the treatment assignment A:
$$Y(0), Y(1) \perp A \mid X$$
$$P(A, Y(0), Y(1) \mid X) = P(A \mid X)\, P(Y(0), Y(1) \mid X)$$
The treatment is assumed to be unconfounded in the sense that the dependence between the treatment assignment and the outcomes is only through something we observe, namely the covariates X.
We're going to assume that:
$$Y(0) = \alpha + \beta X + \epsilon$$
where ϵ (the residual) represents noise (Y(0) does not depend on the treatment variable A), and
$$Y(1) = Y(0) + \gamma$$
Combining the two, we can estimate the ATE γ by fitting:
$$Y = \alpha + \beta X + \gamma A + \epsilon$$
model = CausalModel(Y=data.wt82_71.values, D=data.qsmk.values, X=X.values)
The parameter adj indicates how covariate adjustments are to be performed. Set adj = 0 to not include any covariates. Set adj = 1 to include treatment indicator D and covariates X separately. Set adj = 2 to additionally include interaction terms between D and X. Defaults to 2.
model.est_via_ols(adj=1)
print(model.estimates)
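Since the default is adj = 2, we can rerun the estimate including the interaction terms between D and X for comparison:
model.est_via_ols(adj=2)
print(model.estimates)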
The estimated ATE with adj=1 is 3.349 (a positive weight gain associated with quitting smoking). If our model accurately describes the confounders X, we get a 95% confidence interval: if we repeated the experiment many times, an interval computed this way would contain the true ATE about 95% of the time, though any particular interval may still miss it.
95% Confidence Interval = AVG ± 1.96 * np.sqrt(data[data.qsmk==1].wt82_71.var()/data[data.qsmk==1].shape[0] + data[data.qsmk==0].wt82_71.var()/data[data.qsmk==0].shape[0])
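A runnable version of that formula (a sketch of the large-sample interval for the unadjusted difference in means; the OLS interval above is computed after adjusting for X, so the two won't match exactly):
treated = data[data.qsmk==1].wt82_71
control = data[data.qsmk==0].wt82_71
avg = treated.mean() - control.mean()
se = np.sqrt(treated.var()/treated.shape[0] + control.var()/control.shape[0])
print('95% CI: [' + str(avg - 1.96*se) + ', ' + str(avg + 1.96*se) + ']')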
For each sample, we're going to take the most similar one from the other treatment group and estimate the ATE using these matched pairs. To define 'most similar', we choose the nearest neighbour in the covariate space X, weighting the distances by the inverse of the variance of each covariate.
model = CausalModel(Y=data.wt82_71.values, D=data.qsmk.values, X=X.values)
model.est_via_matching()
print(model.estimates)
ATC (Average Treatment Effect for the Controls) = E[Y(1) - Y(0) | A=0]
ATT (Average Treatment Effect for the Treated) = E[Y(1) - Y(0) | A=1]
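To make the matching idea concrete, here is a minimal hand-rolled sketch of one-to-one nearest-neighbour matching with inverse-variance weighted distances. It estimates only the ATT (each treated subject is matched to its closest control) and is an illustration, not the library's exact algorithm:
# Scale covariates by their standard deviations so that squared Euclidean
# distance weights each covariate by the inverse of its variance.
Xs = X.values / X.values.std(axis=0)
treated_idx = np.where(A.values == 1)[0]
control_idx = np.where(A.values == 0)[0]
diffs = []
for i in treated_idx:
    dist = ((Xs[control_idx] - Xs[i])**2).sum(axis=1)
    nearest = control_idx[dist.argmin()]
    diffs.append(Y.values[i] - Y.values[nearest])
print('Hand-rolled ATT estimate: ' + str(np.mean(diffs)))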
The parameter matches sets the number of matches to use for each subject, and the parameter bias_adj indicates whether bias adjustments should be attempted (https://www.ncbi.nlm.nih.gov/books/NBK424728/).
model.est_via_matching(weights='inv', matches=2)
print(model.estimates)
model.est_via_matching(weights='inv', matches=2, bias_adj=True)
print(model.estimates)
The propensity score is the probability of receiving the treatment given the covariates:
$$e(X) = P(A=1|X)$$
We want to know the value of the potential outcome means E[Y(i)], but from observational data we only have access to E[Y(i) | A=i]. However, under ignorability, weighting each observed outcome by the inverse of its propensity score recovers the potential outcome means:
$$E[Y(1)] = E\left[\frac{A\,Y}{e(X)}\right], \qquad E[Y(0)] = E\left[\frac{(1-A)\,Y}{1-e(X)}\right]$$
Intuitively, each treated sample is re-weighted by 1/e(X) so that it also stands in for similar subjects who were not treated, and symmetrically for the controls.
$$\Delta_{IPSW} = \frac{1}{N}\left(\sum_{i \in \text{Treatment}} \frac{y_i}{e(X_i)} - \sum_{i \in \text{Control}} \frac{y_i}{1 - e(X_i)}\right)$$
model = CausalModel(Y=data.wt82_71.values, D=data.qsmk.values, X=X.values)
The est_propensity_s() method fits a logistic regression of the treatment on the covariates, using a data-driven algorithm to select which linear and quadratic terms to include.
model.est_propensity_s()
propensity = model.propensity['fitted']
df = data.copy()  # work on a copy so we don't add columns to the original frame
df['ips'] = np.where(df.qsmk == 1, 1/propensity, 1/(1 - propensity))  # inverse propensity score weight for each sample
df['ipsw'] = df.wt82_71 * df['ips']
ipswe = (df[df.qsmk==1]['ipsw'].sum() - df[df.qsmk==0]['ipsw'].sum())/df.shape[0]
print('The estimated ATE computed using IPSWE is: ' + str(ipswe))
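Before trusting these weights, it's worth checking overlap: the two groups' propensity score distributions should share common support, since scores near 0 or 1 produce very large weights (this motivates the trimming step later):
sns.kdeplot(propensity[data.qsmk.values==1], label='treatment')
sns.kdeplot(propensity[data.qsmk.values==0], label='control')
plt.legend()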
model = CausalModel(Y=data.wt82_71.values, D=data.qsmk.values, X=X.values)
print(model.summary_stats)
Large values of the normalized differences (Nor-diff) between the two groups' covariate means indicate that simple linear adjustment methods may not be adequate for removing the biases associated with differences in covariates.
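As a sanity check, the normalized difference can be computed by hand for a single covariate (a sketch using the standard formula: the difference in group means scaled by the pooled standard deviation):
t = data[data.qsmk==1]['age']
c = data[data.qsmk==0]['age']
print('Normalized difference for age: ' + str((t.mean() - c.mean()) / np.sqrt((t.var() + c.var()) / 2)))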
We're going to estimate the ATE with the doubly robust weighted estimator: a weighted linear regression in which each sample is weighted by its inverse propensity score.
model = CausalModel(Y=data.wt82_71.values, D=data.qsmk.values, X=X.values)
model.est_propensity_s()
model.est_via_weighting()
print(model.estimates)
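For intuition, a similar weighted regression can be run by hand with statsmodels, reusing the ips column computed earlier (a sketch: causalinference's est_via_weighting is a doubly robust variant, so the numbers won't match exactly):
import statsmodels.api as sm

# Regress the outcome on a constant, the treatment indicator, and the
# covariates, weighting each sample by its inverse propensity score.
exog = sm.add_constant(np.column_stack([data.qsmk.values, X.values]))
wls = sm.WLS(data.wt82_71.values, exog, weights=df['ips'].values).fit()
print('Weighted regression ATE estimate: ' + str(wls.params[1]))  # coefficient on qsmk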
The trim_s() method selects a cutoff automatically and then removes the samples whose estimated propensity score is <= cutoff or >= (1 - cutoff).
model = CausalModel(Y=data.wt82_71.values, D=data.qsmk.values, X=X.values)
model.est_propensity_s()
propensity = model.propensity['fitted']  # fetch the scores before trimming so they stay aligned with data
model.trim_s()
cutoff = model.cutoff
model.est_via_matching()
print(model.estimates)
print(propensity)
print(cutoff)
mask = (propensity > cutoff) & (propensity < 1 - cutoff)
data[mask].plot.scatter(x='active', y='wt82_71', c='qsmk', cmap='rainbow', colorbar=True)
If we don't have prior knowledge of what 'similar samples' means, we can group the samples into strata of similar propensity scores and then compute the ATE within each stratum of subjects.
model = CausalModel(Y=data.wt82_71.values, D=data.qsmk.values, X=X.values)
model.est_propensity_s()
model.stratify_s()
print(model.strata)
We can compute the overall ATE from the different strata using the est_via_blocking method. To achieve that, this function estimates the average treatment effect within each block using regression, and then aggregates the block estimates.
model.est_via_blocking()
print(model.estimates)
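For intuition, here is a hand-rolled version of the same idea: cut the propensity scores into quintiles, estimate a difference in means within each stratum, and average the per-stratum effects weighted by stratum size (a sketch, not est_via_blocking's regression-based procedure):
ps = model.propensity['fitted']
strata = pd.qcut(ps, 5, labels=False)  # quintiles of the propensity score
effects, sizes = [], []
for s in range(5):
    sub = data[strata == s]
    effects.append(sub[sub.qsmk==1].wt82_71.mean() - sub[sub.qsmk==0].wt82_71.mean())
    sizes.append(len(sub))
print('Stratified ATE estimate: ' + str(np.average(effects, weights=sizes)))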