aim_i_annotate_triangulate_iaa.py

# -*- coding: utf-8 -*-
"""aim_i_annotate_triangulate_iaa.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1EO--qaVtiIhMURQkiNTKckpO2cEzSOg-

# Passive suicidality in a repressive U.S. political context: Aim I

_WIP - NOT FOR DISTRIBUTION_

_Imports, re-indexes by date, cleans, reduces, restricts by timeframe; permits regex pattern-matched purposive (Wave 1) and random (Wave 2) sampling and named entity redaction of PushShift/Arctic Shift .jsonl Reddit archives for .xlsx annotation. Computes Cohen's Kappa post-annotation. Performs LLM-assisted per-tag triangulation of annotation discrepancies._

> aim_i_annotate_triangulate_iaa.ipynb<br>
> Simone J. Skeen (10-07-2024)

1. [Prepare](#scrollTo=R4qNxJPa9Cmq)
2. [Write](#scrollTo=WTtuLBqA-z6Q)
2. [Pre-annotation](#scrollTo=lZ_RbpSm7F4f)
3. [Wave I: purposive](#scrollTo=ou-3A98QE_-T)
4. [Wave II: random](#scrollTo=5ssC99HPZH84)
5. [Post-annotation](#scrollTo=ZzSVBMiubjEu)
6. [Human-LLM triangulation](#scrollTo=eMblXk-8_Bd4)
8. [Visualize](#scrollTo=aaI7x86-xZRR)

### 1. Prepare
Installs, imports, and downloads requisite models and packages.
***

**Install**
"""

# Commented out IPython magic to ensure Python compatibility.
#%%capture

# %pip install irrCAC
# %pip install openai
#%pip install --upgrade openai
#%pip install --upgrade pydantic

!python -m spacy download en_core_web_lg --user

"""**Import**"""

import en_core_web_lg
import gzip
import json
import matplotlib.pyplot as plt
import numpy as np
import openai
import os
import pandas as pd
import random
import re
import spacy
import time
import warnings
import webbrowser

from collections import Counter
from google.colab import drive
from irrCAC.raw import CAC
from openai import OpenAI
from sklearn.metrics import cohen_kappa_score
from sklearn.utils import shuffle

spacy.cli.download('en_core_web_lg')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

pd.set_option(
              'display.max_columns',
              None,
              )
pd.set_option(
              'display.max_rows',
              None,
              )

warnings.simplefilter(
                      action = 'ignore',
                      category = FutureWarning,
                      )

#!python -m prodigy stats

"""**Mount gdrive**"""

drive.mount(
            '/content/drive',
            #force_remount = True,
            )

"""**Structure directories**"""

# Commented out IPython magic to ensure Python compatibility.
# %cd /content/drive/My Drive/Colab/bar_policy_suicidality
#%cd /content/drive/My Drive/#<my_project_folder>

#%mkdir bar_policy_suicidality
#%cd bar_policy_suicidality

#%mkdir code inputs

#%cd inputs
#%mkdir annotation archives data

bar_policy_suicidality/
├── code
└── inputs/
    ├── annotation
    ├── archives
    │   └── ### archive name TKTK
    └── data

"""### 2. Write
Writes and imports condense.py, redact.py, triangulate.py, llm_assist.py.
***
"""

# Commented out IPython magic to ensure Python compatibility.
# %cd code

"""#### condense.py

**_subreddit_dataframe_condense_**
"""

# Commented out IPython magic to ensure Python compatibility.
# %%writefile condense.py
# 
# import pandas as pd
# 
# def subreddit_dataframe_condense(df):
#     """
#     Reassigns Pushshift archives to condensed df for annotation, assigns columns for strain,
#     explicit targeting, implicit vulnerability tags
#     """
#     df = df[[
#              'author',
#              'created_utc',
#              'date',
#              'id',
#              'num_comments',
#              'selftext',
#              'subreddit',
#              'title',
#              ]].copy()
# 
#     df.rename(
#               columns = {
#                          'author': 'p_au',
#                          'created_utc': 'p_utc',
#                          'date': 'p_date',
#                          'id': 'p_id',
#                          'num_comments': 'n_cmnt',
#                          'selftext': 'text',
#                          'subreddit': 'sbrt',
#                          'title': 'p_titl',
#                          }, inplace = True,
#             )
# 
#     df = df.assign(
#                    asp = ' ',      ### s_1...3 strains
#                    asp_rtnl = ' ',
#                    dep = ' ',
#                    dep_rtnl = ' ',
#                    val = ' ',
#                    val_rtnl = ' ',
#                    prg = ' ',      ### E_1,2 explicit targeting
#                    tgd = ' ',
#                    age = ' ',      ### I_1...3 implicit vulnerabilities
#                    race = ' ',
#                    dbty = ' ',
#                    insb = ' ',     ### insubstantial
#                    )
# 
#     df = df[~df['text'].isin([
#                               '[deleted]',
#                               '[removed]',
#                               ])]
# 
#     return df

"""**_subreddit_parse_**"""

# Commented out IPython magic to ensure Python compatibility.
# %%writefile -a condense.py
# 
# import pandas as pd
# 
# def subreddit_parse(df, col):
#     """
#     Parses df by subreddit, returns dict 'sub_d' of subreddit-specific df objects.
#     """
#     uniq_val = df[col].unique()
#     sub_d = {}
#     for val in uniq_val:
#         sub_d[f'd_{val}'] = df[df[col] == val].copy()
# 
#     return sub_d

"""#### redact.py"""

# Commented out IPython magic to ensure Python compatibility.
# %%writefile redact.py
# 
# import spacy
# nlp = spacy.load('en_core_web_lg')
# 
# def ner_redact_post_texts(p_text):
#     """
#     Redacts all named entities recognized by spaCy EntityRecognizer, replaces with <|PII|> pseudo-word token.
#     """
#     ne = list(
#               [
#                'PERSON',   ### people, including fictional
#                'NORP',     ### nationalities or religious or political groups
#                'FAC',      ### buildings, airports, highways, bridges, etc.
#                'ORG',      ### companies, agencies, institutions, etc.
#                #'GPE',     ### countries, cities, states
#                'LOC',      ### non-GPE locations, mountain ranges, bodies of water
#                'PRODUCT',  ### objects, vehicles, foods, etc. (not services)
#                'EVENT',    ### named hurricanes, battles, wars, sports events, etc.
#                ]
#                 )
# 
#     doc = nlp(p_text)
#     ne_to_remove = []
#     final_string = str(p_text)
#     for sent in doc.ents:
#         if sent.label_ in ne:
#             ne_to_remove.append(str(sent.text))
#     for n in range(len(ne_to_remove)):
#         final_string = final_string.replace(
#                                             ne_to_remove[n],
#                                             '<|PII|>',
#                                             )
#     return final_string

"""#### triangulate.py

**_calculate_kappa_by_cycle_**
"""

# Commented out IPython magic to ensure Python compatibility.
# %%writefile triangulate.py
# 
# import pandas as pd
# from sklearn.metrics import cohen_kappa_score
# 
# def calculate_kappa_by_cycle(cycle_num):
#     """
#     Calculate Cohen's Kappa and encode disagreements between independent annotators across multiple cycles.
# 
#     Parameters:
#     -----------
#     cycle_num : int
#         Annotation cycle number, used to load the corresponding Excel files (e.g., cycle 0, cycle 1).
# 
#     Returns:
#     --------
#     d : pd.DataFrame
#         Processed df after merging, includes encoded disagreements in *_dis columns.
# 
#     kappa_results : dict
#         A dictionary containing the Cohen's Kappa scores for each indepednently co-annotated target.
#     """
#     # read independently annotated files
# 
#     d_sd = pd.read_excel(f'd_cycle{cycle_num}_sd.xlsx', index_col = [0])
#     d_sd.columns = [f'{col}_sd' for col in d_sd.columns]
# 
#     d_ss = pd.read_excel(f'd_cycle_{cycle_num}_ss.xlsx', index_col = [0])
#     d_ss.columns = [f'{col}_ss' for col in d_ss.columns]
# 
#     # merge
# 
#     d = pd.merge(
#                  d_sd,
#                  d_ss,
#                  left_index = True,
#                  right_index = True,
#                  )
# 
#     # housekeeping
# 
#     targets = [
#                'asp_sd', 'asp_ss',
#                'dep_sd', 'dep_ss',
#                'val_sd', 'val_ss',
#                'prg_sd', 'prg_ss',
#                'tgd_sd', 'tgd_ss',
#                'age_sd', 'age_ss',
#                'race_sd', 'race_ss',
#                'dbty_sd', 'dbty_ss',
#                'insb_sd', 'insb_ss',
#               ]
# 
#     texts = [
#              'text_sd', 'text_ss',
#              'asp_rtnl_sd', 'asp_rtnl_ss',
#              'dep_rtnl_sd', 'dep_rtnl_ss',
#              'val_rtnl_sd', 'val_rtnl_ss',
#              ]
# 
#     d[targets] = d[targets].apply(
#                                   pd.to_numeric,
#                                   errors = 'coerce',
#                                   )
#     d[targets] = d[targets].fillna(0)
#     d[texts] = d[texts].replace(' ', '.')
# 
#     d = d[[
#            'p_id_sd', 'p_id_ss', ### sense-check for bad merge
#            'text_sd',
#            'asp_sd', 'asp_ss',
#            'asp_rtnl_sd', 'asp_rtnl_ss',
#            'dep_sd', 'dep_ss',
#            'dep_rtnl_sd', 'dep_rtnl_ss',
#            'val_sd', 'val_ss',
#            'val_rtnl_sd', 'val_rtnl_ss',
#            'prg_sd', 'prg_ss',
#            'tgd_sd', 'tgd_ss',
#            'age_sd', 'age_ss',
#            'race_sd', 'race_ss',
#            'dbty_sd', 'dbty_ss',
#            'insb_sd', 'insb_ss',
#            ]].copy()
# 
#     d.rename(
#              columns = {
#                         'text_sd': 'text',
#                         }, inplace = True,
#             )
# 
#     # kappa Fx
# 
#     def calculate_kappa(d, col_sd, col_ss):
#         return cohen_kappa_score(d[col_sd], d[col_ss])
# 
#     col_pairs = [
#                  ('asp_sd', 'asp_ss'),
#                  ('dep_sd', 'dep_ss'),
#                  ('val_sd', 'val_ss'),
#                  #('prg_sd', 'prg_ss'),
#                  #('tgd_sd', 'tgd_ss'),
#                  #('age_sd', 'age_ss'),
#                  #('race_sd', 'race_ss'),
#                  #('dbty_sd', 'dbty_ss'),
#                  ]
# 
#     # initialize dict
# 
#     kappa_results = {}
# 
#     # kappa loop
# 
#     for col_sd, col_ss in col_pairs:
#         kappa = calculate_kappa(d, col_sd, col_ss)
#         kappa_results[f'{col_sd} and {col_ss}'] = kappa
# 
#     for pair, kappa in kappa_results.items():
#         print(f"Cohen's Kappa for {pair}: {kappa:.2f}")
# 
#     # dummy code disagreements Fx
# 
#     def encode_disagreements(row):
#         return 1 if row[0] != row[1] else 0
# 
#     col_dis = [
#                ('asp_sd', 'asp_ss', 'asp_dis'),
#                ('dep_sd', 'dep_ss', 'dep_dis'),
#                ('val_sd', 'val_ss', 'val_dis'),
#                #('prg_sd', 'prg_ss', 'prg_dis'),
#                #('tgd_sd', 'tgd_ss', 'tgd_dis'),
#                #('age_sd', 'age_ss', 'age_dis'),
#                #('race_sd', 'race_ss', 'race_dis'),
#                #('dbty_sd', 'dbty_ss', 'dbty_dis'),
#                ]
# 
#     for col1, col2, dis_col in col_dis:
#         d[dis_col] = d[[col1, col2]].apply(encode_disagreements, axis = 1)
# 
#     # export: cycle-specific
# 
#     d.to_excel(f'd_cycle{cycle_num}_iaa.xlsx')
# 
#     return d, kappa_results

"""#### llm_assist.py"""

# Commented out IPython magic to ensure Python compatibility.
# %%writefile llm_assist.py
# 
# import time
# import openai
# 
# api_key = '<my_key>'
# client = openai.OpenAI(api_key = api_key)
# 
# def annotate_post_per_tag(text, prompts):
#     """
#     Applies annotation decisions, based on multiple prompts, to a given text; provides rationale and explanation.
#     Parameters:
#     - text: The text to annotate.
#     - prompts: A list of prompts to apply to the text.
# 
#     Returns:
#     - result: The combined result from all prompts.
#     """
#     try:
# 
#         # concatenate prompts
# 
#         prompt_content = ' '.join(prompts)
# 
#         response = client.chat.completions.create(
#             model = 'gpt-4o',
#             temperature = 0.2,
#             messages = [
#                 {
#                     'role': 'system',
#                     'content': prompt_content
#                 },
#                 {
#                     'role': 'user',
#                     'content': text
#                 }
#             ]
#         )
# 
#         # collect results
# 
#         result = ' '
#         for choice in response.choices:
#             result += choice.message.content
# 
#         print(f'{text}: {result}')
#         return result
#     except Exception as e:
#         print(f'Exception: {e}')
#         return 'error'

"""**_annotate_dataframe_per_tag_**"""

# Commented out IPython magic to ensure Python compatibility.
# %%writefile -a llm_assist.py
# 
# def annotate_dataframe_per_tag(df, prompts_per_tag):
#     """
#     Applies annotate_post_per_tag for multiple tags to each row in dataframe 'd'.
# 
#     Parameters:
#     - df: The dataframe containing texts to annotate.
#     - prompts_per_tag: A dictionary with tag names as keys and a list of prompts as values.
# 
#     Returns:
#     - df: The updated dataframe with annotation results.
#     """
#     for index, row in df.iterrows():
#         for tag, prompts in prompts_per_tag.items():
#             result = annotate_post_per_tag(row['text'], prompts)
#             if result == 'error':
#                 continue
# 
#             # extract rationale, chain of thought ("explanation")
# 
#             rationale, explanation = None, None
#             if f'{tag}_1' in result:
#                 tag_value = 1
#                 rationale = result.split(f'{tag}_rationale:')[1].split(f'strained {tag}:')[0].strip() if f'{tag}_rationale:' in result else None
# 
#             # excise {tag}_explanation and subsequent text from rationale
# 
#                 if rationale and f'{tag}_explanation:' in rationale:
#                     rationale = rationale.split(f'{tag}_explanation:')[0].strip()
# 
#                 #if f'{tag}_explanation:' in rationale:
#                 #    rationale = rationale.split(f'{tag}_explanation:')[0].strip()
# 
#                 explanation = result.split(f'{tag}_explanation:')[1].strip() if f'{tag}_explanation:' in result else None
#             else:
#                 tag_value = 0
# 
#             # results to df
# 
#             df.at[index, f'{tag}_gpt'] = tag_value
#             df.at[index, f'{tag}_rtnl_gpt'] = rationale
#             df.at[index, f'{tag}_expl_gpt'] = explanation
# 
#             # impose delay between API calls
# 
#             time.sleep(1)
# 
#     return df

"""#### Import"""

from condense import(
                     subreddit_dataframe_condense,
                     subreddit_parse,
                     )

from redact import(
                   ner_redact_post_texts,
                   )

from triangulate import(
                        calculate_kappa_by_cycle,
                        )

from llm_assist import(
                       annotate_post_per_tag,
                       annotate_dataframe_per_tag,
                       )

"""### 3. Pre-annotation
Import and format Reddit archives.
***
"""

# read in .json.gz archives

wd = '/content/drive/MyDrive/Colab/bar_policy_suicidality/inputs/archives' ### Colab - gdrive

#wd = 'C:/Users/sskee/OneDrive/Documents/02_tulane/01_research/03_prospectus/d_posts' ### Jupyter - local

ds = []

# loop over .json.gz

for filename in os.listdir(wd):
    if filename.endswith('.json.gz'):
        filepath = os.path.join(
                                wd,
                                filename,
                                )
        with gzip.open(
                       filepath,
                       'rt', ### 'open for reading', 'text mode'
                       encoding = 'utf-8',
                       ) as i:
            data = [json.loads(line) for line in i]
            d_gz = pd.DataFrame(data)
            ds.append(d_gz)

# concatenate

d = pd.concat(
              ds,
              ignore_index = True,
              )

# harmonize

d = d.dropna(
             axis = 1,
             how = 'any',
             )

# de-duplicate

d = d.drop_duplicates(
                      subset = 'id',
                      )

# re-index

d['date'] = pd.to_datetime(
                           d.created_utc,
                           unit = 's',
                           )

d.set_index(
            'date',
            drop = False,
            inplace = True,
            )

# inspect

d.shape
d.dtypes
d.sample(3)

# housekeeping

d = subreddit_dataframe_condense(d)

# restrict timeframe

d = d.loc[(d['p_date'] >= '2022-01-01') & (d['p_date'] <= '2022-12-31')]

# verify

d.shape
sbrt = d['sbrt'].unique()
print(sbrt)
d.head(1)
d.tail(1)

# plot

monthly_counts = d.resample('M').sbrt.value_counts().unstack().fillna(0)

fig, ax = plt.subplots(figsize=(10, 6))

monthly_counts.plot(
                    kind = 'line',
                    ax = ax,
                    )

box = ax.get_position()

ax.set_position(
                [
                 box.x0,
                 box.y0,
                 box.width * 0.8,
                 box.height,
                 ]
                  )

ax.legend(
          loc = 'center left',
          bbox_to_anchor=(1, 0.5),
          )

plt.show()

# parse by subreddit

sub_d = subreddit_parse(
                        d,
                        'sbrt',
                        )

"""**Subset A: strain (_$\hat{s}_{1-3}$_) proxy**"""

#d_ax = sub_d['d_Anxiety'] ### deprecated
d_dp = sub_d['d_depression']
#d_mh = sub_d['d_mentalhealth'] ### deprecated
d_sw = sub_d['d_SuicideWatch']

#print('r/Anxiety')
#d_ax.shape
print("\nr/depression")
d_dp.shape
#print("\nr/mentalhealth")
#d_mh.shape
print("\nr/SuicideWatch")
d_sw.shape

"""**Subset B: explicit targeting (_$E_{1,2}$_) proxy**"""

d_gs = sub_d['d_TheGirlSurvivalGuide']
d_tr = sub_d['d_trans']
d_tx = sub_d['d_TwoXChromosomes']

print("r/TheGirlSurvivalGuide")
d_gs.shape
print("\nr/Trans")
d_tr.shape
print("\nr/TwoXChromosomes")
d_tx.shape

"""### 4. Wave I: purposive
Iterates over regex formulations, subreddit curation, Cycle 0--4 sampling.Includes Cycle 999 training data sampling.
***

#### Cycle 0 (_$n$_ = 100)
"""

# Commented out IPython magic to ensure Python compatibility.
# pilot annotation cycle using r/SuicideWatch

# %cd ../annotation

'.gend\S*|pregnan\S*' ### a priori/canonical
'trans|non-?binary|dysphor\S*|hormone|abort\S*|dobbs|roe' ### inductively derived

rg = re.compile('.gend\S*|pregnan\S*|trans|non-?binary|dysphor\S*|hormone|abort\S*|dobbs|roe', re.I)

d = d.loc[d['text'].str.contains(
                                 rg,
                                 regex = True,
                                 )]

d['text'] = d['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))
d.shape

# export: 'd_cycle*.xlsx'

d.to_excel('d_cycle0.xlsx')

"""#### Cycle 1 (_$n$_ = 100)"""

# subset A: r/anxiety, r/depression, r/mentalhealth, r/SuicideWatch

d_a = pd.concat([
                 d_ax,
                 d_dp,
                 d_mh,
                 d_sw,
                 ])

# subset B: r/trans

d_a.reset_index(
                drop = True,
                inplace = True,
                )

d_b = d_tr.copy()

# subset A

'.gend\S*|pregnan\S*' ### a priori/canonical
'trans|non-?binary|dysphor\S*|hormone|abort\S*|dobbs|roe' ### inductively derived

rg_a = re.compile('.gend\S*|pregnan\S*|trans|non-?binary|dysphor\S*|hormone|abort\S*|dobbs|roe', re.I)

d_a = d_a.loc[d_a['text'].str.contains(
                                       rg_a,
                                       regex = True,
                                       )]

d_a.shape

# subset B

'.criminal\S*|restrict\S*|.law|.legal\S*' ### a priori/canonical

rg_b = re.compile('.criminal\S*|restrict\S*|.law|.legal\S*', re.I)

d_b = d_b.loc[d_b['text'].str.contains(
                                       rg_b,
                                       regex = True,
                                       )]

d_b.shape

d = pd.concat([
               d_a, # n = 9740
               d_b, # n = 1505
               ])

d.shape # N = 11245

# Commented out IPython magic to ensure Python compatibility.
# %cd ../annotation

d = d.sample(n = 100)

d['text'] = d['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))
d.shape

# export: 'd_cycle*.xlsx'

d.to_excel('d_cycle1.xlsx')

"""#### Cycle 2 (_$n$_ = 100)"""

# subset A: r/depression, r/SuicideWatch

d_a = pd.concat([
                 d_dp,
                 d_sw,
                 ])

d_a.reset_index(
                drop = True,
                inplace = True,
                )

# subset B: r/trans

d_b = d_tr.copy()

# subset A

'.gend\S*|pregnan\S*' ### a priori/canonical
'trans|non-?binary|dysphor\S*|hormone|abort\S*|dobbs|roe' ### inductively derived

rg_a = re.compile('.gend\S*|pregnan\S*|trans|non-?binary|dysphor\S*|hormone|abort\S*|dobbs|roe', re.I)

d_a = d_a.loc[d_a['text'].str.contains(
                                       rg_a,
                                       regex = True,
                                       )]

d_a.shape

# subset B

'.criminal\S*|restrict\S*|illegal\S*|outlaw\S*|suicid\S*' ### a priori/canonical

rg_b = re.compile('.criminal\S*|restrict\S*|illegal\S*|outlaw\S*|suicid\S*', re.I)

d_b = d_b.loc[d_b['text'].str.contains(
                                       rg_b,
                                       regex = True,
                                       )]

d_b.shape

d = pd.concat([
               d_a, # n = 5602
               d_b, # n = 729
               ])

d.shape # N = 6331

# Commented out IPython magic to ensure Python compatibility.
# %cd ../annotation

d = d.sample(n = 100)

d['text'] = d['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))

# export: 'd_cycle*.xlsx'

d.to_excel('d_cycle2.xlsx')

"""#### Cycle 3 (_$n$_ = 150)"""

# subset A: r/depression, r/SuicideWatch

d_a = pd.concat([
                 d_dp,
                 d_sw,
                 ])

d_a.reset_index(
                drop = True,
                inplace = True,
                )

# subset B: r/TheGirlsSurvivalGuide, r/trans, r/TwoXChromosomes

d_b = pd.concat([
                 d_gs,
                 d_tr,
                 d_tx,
                 ])

d_b.reset_index(
                drop = True,
                inplace = True,
                )

# subset A

'.gend\S*|pregnan\S*' ### a priori/canonical
'trans|non-?binary|dysphor\S*|hormone|abort\S*|dobbs|roe' ### inductively derived

rg_a = re.compile('.gend\S*|pregnan\S*|trans|non-?binary|dysphor\S*|hormone|abort\S*|dobbs|roe', re.I)

d_a = d_a.loc[d_a['text'].str.contains(
                                       rg_a,
                                       regex = True,
                                       )]

d_a.shape

# subset B

'.criminal\S*|restrict\S*|illegal\S*|outlaw\S*|suicid\S*' ### a priori/canonical

rg_b = re.compile('.criminal\S*|restrict\S*|illegal\S*|outlaw\S*|suicid\S*', re.I)

d_b = d_b.loc[d_b['text'].str.contains(
                                       rg_b,
                                       regex = True,
                                       )]

d_b.shape

d = pd.concat([
               d_a, # n = 5602
               d_b, # n = 1971
               ])

d.shape # N = 7573

# Commented out IPython magic to ensure Python compatibility.
# %cd ../annotation

d = d.sample(n = 150)

d['text'] = d['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))

# export: 'd_cycle*.xlsx'

d.to_excel('d_cycle3.xlsx')

"""#### Cycle 999 (_$n$_ = 1,000): initial training data (purposive)"""

d_999 = d.sample(n = 1000)

d_999['text'] = d_999['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))

# export: 'd_cycle*.xlsx'

d_999.to_excel('d_cycle999.xlsx')

"""#### Cycle 4 (_$n$_ = 150)"""

# Commented out IPython magic to ensure Python compatibility.
# supplementing _prg_: candidate subreddits

# %cd /content/gdrive/My Drive/Colab/bar_policy_suicidality/inputs/archives

d_aw = pd.read_json(
                    #'r_thegirlsurvivalguide_posts.jsonl', # d_gs.xlsx
                    #'r_confession_posts.jsonl', # d_co.xlsx
                    'r_askwomenadvice_posts.jsonl', # d_aw.xlsx
                    #'r_traumatoolbox_posts.jsonl', # d_tb.xlsx
                    #'r_birthcontrol_posts.jsonl', # d_bc.xlsx
                    #'r_WomensHealth_posts.jsonl', # d_wh.xlsx
                    lines = True,
                    )

    ### SJS 7/15: decision: adding r/askwomenadvice to Cycle 4

d_aw.shape
d_aw.head(3)
d_aw.tail(3)

# supplementary housekeeping: r/askwomenadvice

# harmonize

d_aw = d_aw.dropna(
                   axis = 1,
                   how = 'any',
                   )

# de-duplicate

d_aw = d_aw.drop_duplicates(
                            subset = 'id',
                            )

# re-index

d_aw['date'] = pd.to_datetime(
                              d_aw.created_utc,
                              unit = 's',
                              )

d_aw.set_index(
               'date',
               drop = False,
               inplace = True,
               )

# housekeeping

d_aw = subreddit_dataframe_condense(d_aw)

# anonymize

d_aw['text'] = d_aw['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))

d_aw.shape
d_aw.head(3)
d_aw.tail(3)

# subset A: r/depression, r/SuicideWatch

d_a = pd.concat([
                 d_dp,
                 d_sw,
                 ])

d_a.reset_index(
                drop = True,
                inplace = True,
                )

# subset B: r/trans, r/TwoXChromosomes

d_b = pd.concat([
                 d_tr,
                 d_tx,
                 ])

d_b.reset_index(
                drop = True,
                inplace = True,
                )

# subset C: r/askwomenadvice, r/TheGirlsSurvivalGuide

d_c = pd.concat([
                 d_aw,
                 d_gs,
                 ])

d_c.reset_index(
                drop = True,
                inplace = True,
                )

ds = [
      d_a,
      d_b,
      d_c,
      ]

for d in ds:
    d = shuffle(d)

# subset A

'.gend\S*|pregnan\S*' ### a priori/canonical
'trans|non-?binary|dysphor\S*|hormone|abort\S*|dobbs|roe' ### inductively derived

rg_a = re.compile('.gend\S*|pregnan\S*|trans|non-?binary|dysphor\S*|hormone|abort\S*|dobbs|roe', re.I)

d_a = d_a.loc[d_a['text'].str.contains(
                                       rg_a,
                                       regex = True,
                                       )]

d_a.shape

# subset B

'.criminal\S*|restrict\S*|illegal\S*|outlaw\S*|suicid\S*|dobbs|roe|pregnan\S*' ### a priori/canonical

rg_b = re.compile('.criminal\S*|restrict\S*|illegal\S*|outlaw\S*|suicid\S*|dobbs|roe|pregnan\S*', re.I)

d_b = d_b.loc[d_b['text'].str.contains(
                                       rg_b,
                                       regex = True,
                                       )]

d_b.shape

# subset C

d_c = d_c.loc[d_c['text'].str.contains(
                                       rg_b,
                                       regex = True,
                                       )]

d_c.shape

d = pd.concat([
               d_a, # n = 5602
               d_b, # n = 5478
               ])

d.shape # N = 11080

d = d.sample(n = 130)

d['text'] = d['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))

# oversampling for _prg_ - Cycle 4

d_suppl = d_c.sample(n = 20)

d_suppl['text'] = d_suppl['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))

d = pd.concat([
               d, # n = 130
               d_suppl, # n = 20
               ])

d = shuffle(d)

# export: 'd_cycle*.xlsx'

d.to_excel('d_cycle4.xlsx')

"""#### Cycle 999 (_$n$_ = 200): supplementary training data (_$E_{1}$_: 'prg')"""

# Commented out IPython magic to ensure Python compatibility.
# %cd ../annotation

# oversampling for _prg_ - Cycle 999

d_suppl = d_c.sample(n = 200)

d_suppl['text'] = d_suppl['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))

# export: 'd_cycle*.xlsx'

d_suppl.to_excel('d_cycle999_suppl.xlsx')

"""#### Cycle 999 (_$n$_ = 100): supplementary training data (_$I_{2}$_: 'dbty')"""

# Commented out IPython magic to ensure Python compatibility.
# supplementing _dbty_: 2022 r/Disability posts

# %cd /content/drive/My Drive/Colab/bar_policy_suicidality/inputs/archives

d_db = pd.read_json(
                    'r_Disability_posts.jsonl',
                    lines = True,
                    )

d_db.info()
d_db.head(3)
d_db.tail(3)

# harmonize

d_db = d_db.dropna(
                   axis = 1,
                   how = 'any',
                   )

# de-duplicate

d_db = d_db.drop_duplicates(
                            subset = 'id',
                            )

# re-index

d_db['date'] = pd.to_datetime(
                              d_db.created_utc,
                              unit = 's',
                              )

d_db.set_index(
               'date',
               drop = False,
               inplace = True,
               )

# housekeeping

d_db = subreddit_dataframe_condense(d_db)

d_db.shape
d_db.head(3)

# subset B (explicit target proxy) for re

'.criminal\S*|restrict\S*|illegal\S*|outlaw\S*|suicid\S*|dobbs|roe|pregnan\S*' ### a priori/canonical

rg_b = re.compile('.criminal\S*|restrict\S*|illegal\S*|outlaw\S*|suicid\S*|dobbs|roe|pregnan\S*', re.I)

d_db = d_db.loc[d_db['text'].str.contains(
                                         rg_b,
                                         regex = True,
                                         )]

d_db.shape # N = 171

# Commented out IPython magic to ensure Python compatibility.
# %cd ../annotation

d_db_suppl = d_db.sample(n = 100)

d_db_suppl['text'] = d_db_suppl['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))

d_db_suppl = shuffle(d_db_suppl)

# export

d_db_suppl.to_excel('d_db_suppl.xlsx')

"""#### Cycle 999 (_$n$_ = 100): supplementary training data (_$I_{3}$_: 'race')"""

# Commented out IPython magic to ensure Python compatibility.
# supplementing _race_: 2022 r/blackpeople posts

# %cd ../archives

d_bp = pd.read_json(
                    'r_blackpeople_posts.jsonl',
                    lines = True,
                    )

# harmonize

d_bp = d_bp.dropna(
                   axis = 1,
                   how = 'any',
                   )

# de-duplicate

d_bp = d_bp.drop_duplicates(
                            subset = 'id',
                            )

# re-index

d_bp['date'] = pd.to_datetime(
                              d_bp.created_utc,
                              unit = 's',
                              )

d_bp.set_index(
               'date',
               drop = False,
               inplace = True,
               )

# housekeeping

d_bp = subreddit_dataframe_condense(d_bp)

d_bp.info()
d_bp.head(3)

# drop empty cells

d_bp = d_bp[d_bp['text'].str.strip() != '']
d_bp.shape # N = 674

# Commented out IPython magic to ensure Python compatibility.
# random sample - too few (n = 17) regex matches

# %cd ../annotation

d_bp_suppl = d_bp.sample(n = 100)

d_bp_suppl['text'] = d_bp_suppl['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))

d_bp_suppl = shuffle(d_bp_suppl)

# export

d_bp_suppl.to_excel('d_bp_suppl.xlsx')

"""### 5. Wave II: random
Performs random sampling, 1:1 purposive:random ratio in initial training data.
***
"""

### SJS 8/15: same _prg_ oversampling strategy: 1000 random from subsets A and B, plus 200 from r/TheGirlsSurvivalGuide and r/AskWomenAdvice

# strain proxies

d_dp.shape
d_dp.head(3)
d_sw.shape
d_sw.head(3)

# explicit targeting proxies

d_aw.shape
d_aw.head(3)
d_gs.shape
d_gs.head(3)
d_tr.shape
d_tr.head(3)
d_tx.shape
d_tx.head(3)

d_a = pd.concat([
                 d_dp, ### r/depression
                 d_sw, ### r/SuicideWatch
                 ])

d_a = shuffle(d_a)

d_a.reset_index(
                drop = True,
                inplace = True,
                )

d_b = pd.concat([
                 d_tr, ### r/trans
                 d_tx, ### r/TwoXChromosomes
                 ])

d_b = shuffle(d_b)

d_b.reset_index(
                drop = True,
                inplace = True,
                )

d_c = pd.concat([
                 d_aw, ### r/AskWomenAdvice,
                 d_gs, ### r/TheGirlsSurvivalGuide
                 ])

d_c = shuffle(d_c)

d_c.reset_index(
                drop = True,
                inplace = True,
                )

d_a.shape
d_b.shape
d_c.shape

d = pd.concat([
               d_a, # n = 178030
               d_b, # n = 110213
               ])

d.shape # N = 288243

d = d.sample(n = 1000)

d['text'] = d['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))

d.shape # N = 1000

d_suppl = d_c.sample(n = 200)

d_suppl['text'] = d_suppl['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))

d_suppl.shape # N = 200

d = pd.concat([
               d, # n = 1000
               d_suppl, # n = 200
               ])

d.shape # N = 1200

"""#### Cycle 999 (_$n$_ = 1,200): initial training data (random)"""

# Commented out IPython magic to ensure Python compatibility.
# %cd ../annotation

d.to_excel('d_cycle999_rnd.xlsx')

"""### 6. Post-annotation
Computes Cohen's $\kappa$ for SD-SS independently annotated dataframes per cycle, flags disagreements.
***
"""

# Commented out IPython magic to ensure Python compatibility.
# Cycle 4 - pre-LLM triangulation

# %cd ../inputs/annotation

d, kappa_results = calculate_kappa_by_cycle(4)

# Commented out IPython magic to ensure Python compatibility.
# Cycle 4 - post-LLM triangulation

# %cd ../inputs/annotation

# kappa Fx

def calculate_kappa(d, col_sd, col_ss):
    return cohen_kappa_score(d[col_sd], d[col_ss])

# define SD-SS col pair list

col_pairs = [
             ('asp_sd', 'asp_ss'),
             ('dep_sd', 'dep_ss'),
             ('val_sd', 'val_ss'),
             #('prg_sd', 'prg_ss'),
             #('tgd_sd', 'tgd_ss'),
             #('age_sd', 'age_ss'),
             #('race_sd', 'race_ss'),
             #('dbty_sd', 'dbty_ss'),
             ]

# initialize dict

kappa_results = {}

# kappa loop

for col_sd, col_ss in col_pairs:
    kappa = calculate_kappa(d, col_sd, col_ss)
    kappa_results[f'{col_sd} and {col_ss}'] = kappa

for pair, kappa in kappa_results.items():
    print(f"Cohen's Kappa for {pair}: {kappa:.2f}")

"""### 7. Human-LLM triangulation
Defines prompts for, and performs, GPT-4o-assisted human-in-the-loop annotation using chain-of-thought reasoning.
***

#### _$\hat{s}_{1}$_: 'asp' prompt
"""

role = '''
You are tasked with applying qualitative codes to social media posts to categorize whether each post contains an expression of _aspiration strain_.
'''

definition = '''
Definition of aspiration strain: 'any description of ambition, futurity, idealized or speculative lifecourse trajectories, or personal,
professional, familial goals driving psychological strain and/or self-destructive cognitions.' Descriptions can be explicit or implicit.
'''

instruction = '''
Below I instruct on how to apply the codes.

Respond with 'asp_1' if the post contains an expression of aspiration strain, and '0' if it does not.

You must choose a 'asp_1' or a '0' response.

If your response is 'asp_1,' then begin a new paragraph with 'asp_rationale' and excerpt the sentences or phrases that are the _most expressive of
aspiration strain_. You are allowed to choose multiple sentences or phrases, divided by an '<SPL>' token.

Then, if you have selected a 'asp_1,' begin a new paragraph with 'asp_explanation:' and provide a two sentence explanation for your response.
'''

#strain = '''
#If your response is 'asp_1,' then begin a new paragraph with 'strained aspirations:' and concisely name the strained aspiration that is driving the distressful
#cognitions.
#'''

clarification = '''
Here are additional clarifying points based in human expertise:
-	Regret over, or wishing to redo, a past decision does _not_ warrant a 'asp_1' response
-	Aspiration for physical impossibility (time travel, age reversion), does not warrant a 'asp_1' response
-	Sparse decontextualized expressions of loneliness (e.g. 'I am lonely') do not warrant a 'asp_1' response; recognition of need or yearning for
friendship, community, and/or intimacy must be explicit to warrant a 'asp_1' response
-	Sparse decontextualized desire for a different assigned sex at birth does not warrant a 'asp_1' response; desire for gender transition or
gender-expansive expression must be explicit.
- Perceived inability to die by suicide does not warrant a 'asp_1' response
'''

asp_prompt = f'{role}{definition}{instruction}{clarification}'
#print(asp_prompt)

"""#### _$\hat{s}_{2}$_:  'dep' prompt"""

role = '''
You are tasked with applying qualitative codes to social media posts to categorize whether each post contains an expression of _deprivation strain_.
'''

definition = '''
Definition of deprivation strain: 'any description of present or past inequities or scarcities in financial capital, housing security, material circumstance,
dignity, or decisional autonomy, driving intra-psychic strain and/or self-destructive cognitions.' Descriptions can be explicit or implicit.
'''

instruction = '''
Below I instruct on how to apply the codes.

Respond with 'dep_1' if the post contains an expression of deprivation strain, and '0' if it does not.

You must choose a 'dep_1' or a '0' response.

If your response is 'dep_1,' then begin a new paragraph with 'dep_rationale' and excerpt the sentences or phrases that are the _most expressive of
deprivation strain_. You are allowed to choose multiple sentences or phrases, divided by an '<SPL>' token.

Then, if you have selected a 'dep_1,' begin a new paragraph with 'dep_explanation:' and provide a two sentence explanation for your response.
'''

#strain = '''
#If your response is 'dep_1,' then begin a new paragraph with 'deprivation:' and concisely name the deprivation that is driving the distressful
#cognitions.
#'''

clarification = '''
Here are additional clarifying points based in human expertise:
- Apprehensions over possible _future_ deprivation does not warrant a 'dep_1' response
- Scarcity or inaccessibility of formal systems of care (hospitals, clinics) does warrant a 'dep_1' response
'''

dep_prompt = f'{role}{definition}{instruction}{clarification}'
#print(dep_prompt)

"""#### _$\hat{s}_{3}$_:  'val' prompt"""

role = '''
You are tasked with applying qualitative codes to social media posts to categorize whether each post contains an expression of _value strain_.
'''

definition = '''
Definition of value strain: 'any description of conflicting ideologies, norms, mores, morals, ethics, principles, or ontologies, particularly
along traditionalist-authoritarian versus progressive-liberatory axes, driving intra-psychic strain and/or self-destructive cognitions.' Descriptions
can be explicit or implicit.
'''

instruction = '''
Below I instruct on how to apply the codes.

Respond with 'val_1' if the post contains an expression of value strain, and '0' if it does not.

You must choose a 'val_1' or a '0' response.

If your response is 'val_1,' then begin a new paragraph with 'val_rationale:' and excerpt the sentences or phrases that are the _most expressive of
value strain_. You are allowed to choose multiple sentences or phrases, divided by an '<SPL>' token.

Then, if you have selected a 'val_1,' begin a new paragraph with 'val_explanation:' and provide a two sentence explanation for your response.
'''

#strain = '''
#If your response is 'val_1,' then begin a new paragraph with 'strained values:' and concisely name the strained value that is driving the distressful
#cognitions.
#'''

clarification = '''
Here are additional clarifying points based in human expertise:
-	a 'val_1' response can refer to interpersonal ideological discord and/or internalized ideologically determined self-denigration
-	Anticipated value strain does warrant a 'val_1' response
-	Familial conflict alone is insufficient to warrant a 'val_1' response
-	Perceived failure to uphold self-imposed expectations is insufficient to warrant a 'val_1' response
-	Stress or tension over legality of decisions, necessities does warrant a 'val_1' response
-	ideological or normative aspect must be explicit
'''

val_prompt = f'{role}{definition}{instruction}{clarification}'
#print(val_prompt)

"""#### _$E_{1}$_:  'prg' prompt"""

role = '''
You are tasked with applying qualitative codes to social media posts to categorize whether each post indicates that the author is a person who can get pregnant.
'''

definition = '''
Definition of person who can get pregnant: 'disclosure of pregnancy capability, past pregnancy, specific anatomy (e.g. menstruation, possession
of uterus, etc.), and associated socio-medical needs.' Descriptions must be explicit.
'''

instruction = '''
Below I instruct on how to apply the codes.

Respond with 'prg_1' if the post post indicates that the author is a person who can get pregnant, and '0' if it does not.

You must choose a 'prg_1' or a '0' response.

If your response is 'prg_1,' then begin a new paragraph with 'prg_rationale:' and excerpt the sentences or phrases that are the _most indicative_ of
pregnancy capability. You are allowed to choose multiple sentences or phrases, divided by an '<SPL>' token.

Then, if you have selected a 'prg_1,' begin a new paragraph with 'prg_explanation:' and provide a two sentence explanation for your response.
'''

clarification = '''
Here is an additional clarifying point based in human expertise:
- Descriptions of pregnancy capability must refer, in first person, to the post author - not another person.
- It is _not necessary_ for the post author to identify as a woman.
'''

prg_prompt = f'{role}{definition}{instruction}{clarification}'
#print(prg_prompt)

"""#### _$E_{2}$_:  'tgd' prompt"""

role = '''
You are tasked with applying qualitative codes to social media posts to categorize whether each post indicates that the author is transgender.
'''

definition = '''
Definition of person who is transgender: 'disclosure of identity as transgender, transexual, non-binary, gender-diverse, questioning their
gender, living or identifying as a gender that differes from their sex assigned at birth.' Descriptions must be explicit.
'''

instruction = '''
Below I instruct on how to apply the codes.

Respond with 'tgd_1' if the post post indicates that the author is a person who can get pregnant, and '0' if it does not.

You must choose a 'tgd_1' or a '0' response.

If your response is 'tgd_1,' then begin a new paragraph with 'tgd_rationale:' and excerpt the sentences or phrases that are the _most indicative_ of
transgender identity. You are allowed to choose multiple sentences or phrases, divided by an '<SPL>' token.

Then, if you have selected a 'tgd_1,' begin a new paragraph with 'tgd_explanation:' and provide a two sentence explanation for your response.
'''

clarification = '''
Here is an additional clarifying point based in human expertise:
- Descriptions of transgender identity must refer, in first person, to the post author - not another person.
- Post authors expressing envy of another gender's privilege does not warrant a 'tgd_1' response.
'''

tgd_prompt = f'{role}{definition}{instruction}{clarification}'
#print(tgd_prompt)

"""#### _$I_{1}$_:  'age' prompt"""

role = '''
You are tasked with applying qualitative codes to social media posts to categorize whether each post indicates that the author is younger than 30 years old.
'''

definition = '''
Definition of person who is younger than 30 years old: 'disclosure of age lower than 30 years, including via self-identification as an adolescent, teenager, or
high-schooler' Descriptions must be explicit.
'''

instruction = '''
Below I instruct on how to apply the codes.

Respond with 'age_1' if the post post indicates that the author is a person who can get pregnant, and '0' if it does not.

You must choose an 'age_1' or a '0' response.

If your response is 'age_1,' then begin a new paragraph with 'age_rationale:' and excerpt the sentences or phrases that are the _most indicative_ of
transgender identity. You are allowed to choose multiple sentences or phrases, divided by an '<SPL>' token.

Then, if you have selected an 'age_1,' begin a new paragraph with 'age_explanation:' and provide a two sentence explanation for your response.
'''

clarification = '''
Here is an additional clarifying point based in human expertise:
- Descriptions of age younger than 30 must refer, in first person, to the post author - not another person.
- Post authors introducing themselves with their age and gender (for example 'Hi, 29F here') does warrant a 'age_1' response if the age is lower than 30.
'''

age_prompt = f'{role}{definition}{instruction}{clarification}'
#print(age_prompt)

"""#### _$I_{2}$_:  'dbty' prompt"""

role = '''
You are tasked with applying qualitative codes to social media posts to categorize whether each post indicates that the author is disabled.
'''

definition = '''
Definition of person who is disabled: 'disclosure of a physical disability or debilitating chronic disease.' Descriptions must be explicit.
'''

instruction = '''
Below I instruct on how to apply the codes.

Respond with 'dbty_1' if the post post indicates that the author is disabled, and '0' if it does not.

You must choose a 'dbty_1' or a '0' response.

If your response is 'dbty_1,' then begin a new paragraph with 'dbty_rationale:' and excerpt the sentences or phrases that are the _most indicative_ of
transgender identity. You are allowed to choose multiple sentences or phrases, divided by an '<SPL>' token.

Then, if you have selected a 'dbty_1,' begin a new paragraph with 'dbty_explanation:' and provide a two sentence explanation for your response.
'''

clarification = '''
Here is an additional clarifying point based in human expertise:
- Descriptions of disability must refer, in first person, to the post author - not another person.
- Autism is not a disability.
- Disability can include intellecutal or developmental disability, including cognitive decline, if described explicitly.
'''

dbty_prompt = f'{role}{definition}{instruction}{clarification}'
#print(age_prompt)

"""#### _$I_{3}$_:  'race' prompt"""

role = '''
You are tasked with applying qualitative codes to social media posts to categorize whether each post indicates that the author discloses any non-white race or ethnicity.
'''

definition = '''
Definition of person who of non-white race or ethnicity: 'disclosure of an identity as a Black, Latina, Latino, Latinx, or Latine person, or a person targeted
by racism' Descriptions must be explicit.
'''

instruction = '''
Below I instruct on how to apply the codes.

Respond with 'race_1' if the post post indicates that the author is a person who can get pregnant, and '0' if it does not.

You must choose a 'race_1' or a '0' response.

If your response is 'race_1,' then begin a new paragraph with 'race_rationale:' and excerpt the sentences or phrases that are the _most indicative_ of
transgender identity. You are allowed to choose multiple sentences or phrases, divided by an '<SPL>' token.

Then, if you have selected a 'race_1,' begin a new paragraph with 'race_explanation:' and provide a two sentence explanation for your response.
'''

clarification = '''
Here is an additional clarifying point based in human expertise:
- Descriptions of non-white race or ethnicity must refer, in first person, to the post author - not another person.
'''

race_prompt = f'{role}{definition}{instruction}{clarification}'
#print(race_prompt)

"""#### Wave I (purposive): LLM co-annotation"""


"""#### Wave II (random): LLM co-annotation"""

# Commented out IPython magic to ensure Python compatibility.
# %pwd

# Commented out IPython magic to ensure Python compatibility.
# %cd ../inputs/annotation

d_cycle999_rnd_ss_single = pd.read_excel(
                                         'd_cycle999_rnd_ss_single.xlsx',
                                         index_col = [0],
                                         )

d_cycle999_rnd_ss_single.reset_index(
                                     drop = True,
                                     inplace = True,
                                     )

d_cycle999_rnd_ss_single.info()
d_cycle999_rnd_ss_single.head(3)

# rename sjs annotations

ss_annotations = [
                  'asp',
                  'asp_rtnl',
                  'dep',
                  'dep_rtnl',
                  'val',
                  'val_rtnl',
                  'prg',
                  'tgd',
                  'age',
                  'dbty',
                  'race',
                  ]

for s in ss_annotations:
    d_cycle999_rnd_ss_single.rename(
                                    columns = {s: s + '_ss'},
                                    inplace = True,
                                    )

d_cycle999_rnd_ss_single.info()
d_cycle999_rnd_ss_single.head(3)

# halving df to check API usage rate

d_cycle999_rnd_ss_single_01 = d_cycle999_rnd_ss_single.iloc[:600]  ### first 600 rows
d_cycle999_rnd_ss_single_02 = d_cycle999_rnd_ss_single.iloc[600:]  ### last 600 rows

d_cycle999_rnd_ss_single_01.shape
d_cycle999_rnd_ss_single_01.head(1)
d_cycle999_rnd_ss_single_02.shape
d_cycle999_rnd_ss_single_02.head(1)

# Commented out IPython magic to ensure Python compatibility.
# %pwd

#%%capture

# define prompts per tag

prompts_per_tag = {
                   'asp': [asp_prompt],
                   'dep': [dep_prompt],
                   'val': [val_prompt],
                   'prg': [prg_prompt],
                   'tgd': [tgd_prompt],
                   'age': [age_prompt],
                   'dbty': [dbty_prompt],
                   'race': [race_prompt],
                   }

# annotate df

d_cycle999_rnd_ss_gpt_dual_02 = annotate_dataframe_per_tag(
                                                        d_cycle999_rnd_ss_single_02,
                                                        prompts_per_tag,
                                                        )

d_cycle999_rnd_ss_gpt_dual_02.info()
d_cycle999_rnd_ss_gpt_dual_02.head(3)

d_cycle999_rnd_ss_gpt_dual_02.to_excel('d_cycle999_rnd_ss_gpt_dual_02.xlsx')

"""### 8. Visualize
Plots cycle-wise IAA scores.
***

**Line chart: $\kappa$ by cycle**
"""

# Commented out IPython magic to ensure Python compatibility.
# prelim: ARMHR 9/20 talk

# %cd /content/drive/My Drive/Colab/bar_policy_suicidality/outputs/tables

d_v = pd.read_excel('d_cycle_kappas.xlsx')

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# %cd .../figures

sns.set(style = 'whitegrid')

# custom x-axis labels

custom_labels = [
                 'cycle 1',
                 'cycle 2',
                 'cycle 3',
                 'pre-GTP: cycle 4',
                 'post-GPT: cycle 4',
                 ]

# plot

plt.figure(figsize = (
                      8,
                      6,
                      )
          )

plt.plot(
         d_v['cycle'],
         d_v['asp_cohens_k'],
         label = "asp",
         marker = 's',
         alpha = 0.6,
         color = 'hotpink',
         )

plt.plot(
         d_v['cycle'],
         d_v['dep_cohens_k'],
         label = "dep",
         marker = 's',
         alpha = 0.6,
         color = 'tomato',
         )

plt.plot(
         d_v['cycle'],
         d_v['val_cohens_k'],
         label = "val",
         marker = 's',
         alpha = 0.6,
         color = 'mediumorchid',
         )

# labels, title

#plt.xlabel('Cycle')
plt.ylabel("Cohen's $\kappa$")
#plt.title("Cohen's $\kappa$ Value per Cycle for ASP, DEP, and VAL")

# custom x-axis labels, 45-degree angle

plt.xticks(
           ticks = d_v['cycle'],
           labels=custom_labels,
           rotation = 45,
           )


# horizontal gridlines

plt.grid(axis='x')

# set line at 0.7 threshold

plt.axhline(
            y = 0.7,
            color = 'red',
            linewidth = 0.6,
            linestyle = '--',
            )

# x-axis at 0

plt.ylim(
         0,
         None,
         )

# legend

plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3, frameon=False)

# x-axis ticks

plt.gca().tick_params(axis='x', which='both', direction='in', length=5)  # Adds ticks to the x-axis labels

# optimized markers

#x0 = [2, 5, 5]
#y0 = [0.72, 0.96, 0.97]
#plt.plot(x0, y0, "s", markersize = 7, color = 'red')

#for a,b in zip(x0, y0):
#    plt.text(a, b, str(b),fontsize=9, ha='right',va='top')

sns.despine(
            left = True,
            )

# tight layout

plt.tight_layout()

# save

plt.savefig('cycle_kappa_line.png')

# display

plt.show()

"""> End of aim_i_annotate_triangulate_iaa.ipynb"""