/*************************************************************
**
**   Prepared by: Adam Ross Nelson JD PhD
**                https://github.com/adamrossnelson
**                https://twitter.com/adamrossnelson
**                July 2020
**
**   Purpose:     Demonstrate Stata-Python Code Crosswalk
**                Stata-Python Code Rosetta Stone
**
***************************************************************/

// Start from a clean session: clear the Results window, turn off -more-
// pauses, and drop any data left in memory
cls
set more off
clear all
// Close a log if one is open (capture suppresses the error when none is),
// then write this session to a plain-text log, replacing any earlier copy
capture log close
log using my_logfile.txt, text replace

// Load the example automobile dataset that ships with Stata
sysuse auto

python
from sfi import Data          # Import Stata's Python API
import pandas as pd           # Import Pandas, Popular DataFrame module
import numpy as np            # Import Numpy, Popular scientific computing module

# Display parameters that will approximate Stata output
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

# Store dataset variable names for later reference
# Two routes are shown: the column index of a dataframe built from the data,
# then a list built through the Stata API. The second assignment replaces the
# first, so later python blocks see 'vars' as a plain list of names.
# NOTE(review): 'vars' shadows Python's builtin vars(); kept because later
# blocks refer to it by this name.
vars = pd.DataFrame(Data.getAsDict()).columns
vars = [Data.getVarName(x) for x in range(0,Data.getVarCount())]

# Store dataset in a Pandas dataframe
df = pd.DataFrame(Data.getAsDict())
df.head()

# Summary of rep78 as returned by the default getAsDict() call above
df['rep78'].describe().transpose()

# Store dataset in a Pandas dataframe, manage missing and value labels
# This rebuilds df: value labels are requested in place of numeric codes and
# Stata missing values are mapped to np.nan. Later blocks use this version.
df = pd.DataFrame(Data.getAsDict(valuelabel=True, 
                                 missingval=np.nan))
df.head()

end

// Loop through each variable - Stata
// The display string must close with a straight double quote ("); a
// typographic (curly) quote leaves the string unterminated.
foreach var of varlist * {
    di "`var'"
}

python
# Loop through each variable - Python
# 'vars' is the list of variable names stored in the first python block
for v in vars:
    print(v)

end


// Stata Code - List first five observations
list in 1/5
list make trunk weight foreign in 1/5

// Stata Code - List last five observations
list in -5/-1
list make trunk weight foreign in -5/-1

python
# Python Code - List first five observations
# Option 1: Stata's API, listing observations 0 through 4 (zero-based)
Data.list(obs=range(0,5))
Data.list('make trunk weight foreign', obs=range(0,5))

# Option 2: Pandas, using the dataframe built in the first python block
df.head()
df[['make','trunk','weight','foreign']].head()

# Python Code - List last five observations (Pandas)
df.tail()
df[['make','trunk','weight','foreign']].tail()

end

// Stata Code - Describe the data set
desc

python
# Python Code - Describe the data set
# Pandas view of the dataframe's columns
df.info()

# Stata API view: one formatted line per variable with its type and label
for var in vars:
    print('{:18} {:12} {}'.format(var, Data.getVarType(var), Data.getVarLabel(var)))

# The same name / type / label metadata gathered into a dataframe and displayed
pd.DataFrame({'Variable Name':vars, 
              'Data Type':[Data.getVarType(v) for v in vars], 
              'Variable Label':[Data.getVarLabel(l) for l in vars]})
			  
# Build the metadata table again, this time keeping it under a name
auto_info =  pd.DataFrame({'Variable Name':vars, 
                           'Data Type':[Data.getVarType(v) for v in vars], 
                           'Variable Label':[Data.getVarLabel(l) for l in vars]})
auto_info
			  
end

// Stata Code - Get summary statistics
// (sum is the abbreviated form of summarize)
sum

python
# Python Code - Get summary statistics
# describe() returns one column per variable and one row per statistic
df.describe()

# Transposed: one row per variable, one column per statistic
df.describe().transpose()

end

// Stata Code - Generate new text variable
gen newtxt = "Some text here"

// Drop the Stata-made copy so the Python code below can add newtxt itself;
// the variable cannot be added while one of the same name already exists
drop newtxt

python
# Python Code - Generate new text variable with Stata API
# A 20-character string variable holds the 14-character value stored below
Data.addVarStr('newtxt', 20)
Data.store('newtxt', None, ['Some text here'] * Data.getObsTotal())

# Python Code - Generate new text variable with Stata API & Pandas
# Assigning a scalar fills every row of the new dataframe column; the column
# is then written over the Stata variable added above
df['newtxt'] = 'Some text here'
Data.store('newtxt', None, df['newtxt'])

end

// Stata Code - Transform continuous to binary
// Uses the same 4000 cut-off as the Python version and the text categorical
gen isExpensive = price > 4000

// Stata Code - Create text based categorical
gen Expensive = "Affordable"
replace Expensive = "Expensive" if price > 4000

gen Long = "Short"
replace Long = "Long" if length > 187

// Drop the Stata-made indicator so the Python code below can add it again
drop isExpensive

python
# Python Code - Transform continuous to binary Combine Stata's API with Pandas
# Add the byte variable through the API, compute the 0/1 flag in Pandas,
# then write the column back into the Stata dataset
Data.addVarByte('isExpensive')
df['isExpensive'] = [1 if p > 4000 else 0 for p in df['price']]
Data.store('isExpensive', None, df['isExpensive'])

# Python code - Create text based categorical
# These columns exist only in the dataframe; the Stata dataset already holds
# Expensive and Long from the gen/replace commands above
df['Expensive'] = ['Expensive' if p > 4000 else 'Affordable'
                   for p in df['price']]

# The loop variable avoids the name 'len' so the builtin is not shadowed
df['Long'] = ['Long' if car_length > 187 else 'Short'
              for car_length in df['length']]


end

// Stata Code - Interact price with itself
gen price2 = price * price

// Drop the Stata-made copy so the Python code below can add price2 itself
drop price2

python
# Python Code - Interact price with itself
# A Stata int tops out at 32,740, far below any squared price in this data
# (values run into the hundreds of millions), so a long is used instead
Data.addVarLong('price2')
# Vectorised multiply, mirroring the Stata expression price * price
df['price2'] = df['price'] * df['price']
Data.store('price2', None, df['price2'])

end

/*************************************************************
**
**   Problematic code. Expected behavior is to delete
**   price2 column from data frame. Experienced behavior
**   is output that indicates delimiter modification.
**
**   del['price2']
**   df['price2'] = df['price'] * df['price']
**
***************************************************************/

// Stata Code - Tabulate two categorical
tab Expensive foreign

python
# Python Code - Tabulate two categorical
# First argument supplies the rows, second the columns
pd.crosstab(df['Expensive'], df['foreign'])

end

// Stata Code - Tabulate three categorical
table rep78 Expensive foreign

python
# Python Code - Tabulate three categorical
# rep78 supplies the rows; the list gives two nested column levels
pd.crosstab(df['rep78'], [df['Expensive'], df['foreign']])

end

// Stata Code - Tabulate four categorical
// NOTE(review): isExp looks like an abbreviation of isExpensive (the 0/1
// variable stored from Python earlier), while the Python version below uses
// the text column Expensive - confirm this difference is intended
table foreign rep78 isExp, by(Long)

python
# Python Code - Tabulate four categorical
# Two nested row levels (Long, foreign) by two nested column levels
# (Expensive, rep78)
pd.crosstab([df['Long'], df['foreign']],
            [df['Expensive'], df['rep78']])

end

// Stata Code - The 'missing option'
tab rep78, missing

python
# Python Code - The '.fillna()' method
# Missing rep78 values are replaced with the string 'None' so they are
# counted as their own category by value_counts()
df['rep78'].fillna('None').value_counts()

end

// Stata Code - Merge two datasets
// use ..., clear replaces the auto data currently in memory
use http://www.stata-press.com/data/r15/autoexpense.dta, clear
merge 1:1 make using http://www.stata-press.com/data/r15/autosize.dta

python
# Python Code - Load separate data sets
expns_df = pd.read_stata('http://www.stata-press.com/data/r15/autoexpense.dta')
sizes_df = pd.read_stata('http://www.stata-press.com/data/r15/autosize.dta')

# NOTE(review): the merge and its tabulation below are commented out, so this
# block currently only loads the two dataframes - confirm whether intended

# Perform the merge operation
# df = pd.merge(expns_df, sizes_df, on='make', how='outer', indicator=True)

# Display results using the '_merge' indicator variable
# df['_merge'].value_counts()

end