Begin by copying this file into the main directory of your workspace.

In [ ]:
import sys
!{sys.executable} -m pip install plotly
import pandas as pd
import numpy as np
import plotly.express as px
In [ ]:
# Demographics tables, one CSV per data split. The paths are relative, so
# they resolve against the notebook's current working directory.
train_demo_dat = pd.read_csv('gcs/pre-processed/train/train_demographics.csv')
valid_demo_dat = pd.read_csv('gcs/pre-processed/valid/valid_demographics.csv')
test_demo_dat = pd.read_csv('gcs/pre-processed/test/test_demographics.csv')

# Encounter tables for the same three splits.
train_encount_dat = pd.read_csv('gcs/pre-processed/train/train_encounters.csv')
valid_encount_dat = pd.read_csv('gcs/pre-processed/valid/valid_encounters.csv')
test_encount_dat = pd.read_csv('gcs/pre-processed/test/test_encounters.csv')

We start by loading the data. Once we have the test, training, and validation splits of both the demographic and the encounter data, we can join the two datasets to get demographics at the patient level rather than the encounter level.

In [ ]:
## Combine the test/train/valid splits into one encounter table and one
## demographics table.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
encount_dat = pd.concat(
    [test_encount_dat, train_encount_dat, valid_encount_dat])
demo_dat = pd.concat([test_demo_dat, train_demo_dat, valid_demo_dat])

# Number of distinct encounter numbers in the demographics table
# (previously this was computed from the encounter table by mistake).
num_dem = len(demo_dat[['ENCOUNTER_NUM']].drop_duplicates())

# Encounter -> patient lookup. Exact duplicate rows are dropped so the join
# below cannot repeat a demographics row for the same encounter/patient pair.
encount_dat_pt = encount_dat[
    ['ENCOUNTER_NUM', 'PATIENT_DK']].drop_duplicates()
# Row count of the de-duplicated lookup.
num_enc = len(encount_dat_pt['ENCOUNTER_NUM'])

# Join with patient data.
# Left join: every demographics row is kept, and PATIENT_DK is missing for
# any encounter number that has no match in the lookup.
patient_dat = demo_dat.merge(
    encount_dat_pt, on='ENCOUNTER_NUM', how='left')

# One row per distinct (patient, sex) pair.
patient_sex = patient_dat[['PATIENT_DK', 'sex']]
patient_sex_unique = patient_sex.drop_duplicates()

Once the data is prepped, we can group on the sex field to get patient counts by sex.

In [ ]:
# Reduce the joined table to one row per distinct (patient, sex) pair so
# that each patient is counted once per recorded sex value.
patient_sex = patient_dat[['PATIENT_DK', 'sex']]
patient_sex_unique = patient_sex.drop_duplicates()

# Count the non-null PATIENT_DK values within each sex group.
sex_counts = patient_sex_unique[['PATIENT_DK', 'sex']].groupby(
    'sex', as_index=False).count()

# NOTE(review): the labels are assigned by position, so this requires exactly
# two groups and relies on groupby's sorted key order -- 'M' goes to the
# smallest raw value and 'F' to the next. Confirm this matches the encoding
# of the raw `sex` column.
sex = sex_counts.assign(sex=['M', 'F']).rename(
    columns={'sex': 'Sex', 'PATIENT_DK': 'Count'})

sex

Finally, we use the plotly package to plot the results.

In [ ]:
# Bar chart of patient counts: one bar per sex, coloured by sex.
fig_sex = px.bar(
    sex,
    x="Sex",
    y="Count",
    color="Sex",
    labels={"Sex": "Sex", "Count": "Number of Patients"},
)
# Bar colour duplicates the x-axis category, so the legend is hidden; the
# paper background is set to a light grey.
fig_sex.update_layout(paper_bgcolor="#f9f9f9", showlegend=False)

fig_sex