From 9e4bb1056653949eda6adfc328c7ad2ff8700458 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Wed, 15 Jan 2025 14:27:59 -0500 Subject: [PATCH 01/21] =?UTF-8?q?successfully=20answer=202+2=3D=3F?= --- .gitignore | 2 ++ README.md | 18 ++++++++++++++++++ credentials-template.yaml | 4 ++++ q-and-a.yaml | 1 + requirements.txt | 2 ++ run.py | 30 ++++++++++++++++++++++++++++++ 6 files changed, 57 insertions(+) create mode 100644 credentials-template.yaml create mode 100644 q-and-a.yaml create mode 100644 requirements.txt create mode 100644 run.py diff --git a/.gitignore b/.gitignore index 82f9275..77f110a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +credentials.yaml + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index 78109c8..65abea4 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,20 @@ # llm-for-dp-qa Experiments with LLMs for Q+A about DP + +To get started: +``` +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + +Fill in `credentials-template.yaml` with the [key and secret for this app](https://portal.apis.huit.harvard.edu/my-apps/6dce5383-bcb6-4c9f-bd14-8f59d356b221), and then copy into place: +``` +cp credentials-template.yaml credentials.yaml +``` + +You should then be able to run the queries against the API, and there should be a new output file: +``` +run.py +ls outputs +``` \ No newline at end of file diff --git a/credentials-template.yaml b/credentials-template.yaml new file mode 100644 index 0000000..fd15e76 --- /dev/null +++ b/credentials-template.yaml @@ -0,0 +1,4 @@ +# Fill this in with the values at +# https://portal.apis.huit.harvard.edu/my-apps/6dce5383-bcb6-4c9f-bd14-8f59d356b221 +key: fill-in +secret: fill-in \ No newline at end of file diff --git a/q-and-a.yaml b/q-and-a.yaml new file mode 100644 index 0000000..f4732e2 --- /dev/null +++ b/q-and-a.yaml @@ -0,0 +1 @@ +- What is 4+4? \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0f2eecc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +pyyaml +requests \ No newline at end of file diff --git a/run.py b/run.py new file mode 100644 index 0000000..dd3b33a --- /dev/null +++ b/run.py @@ -0,0 +1,30 @@ +from yaml import safe_load, dump +import requests +from pathlib import Path +import json + +if __name__ == '__main__': + credentials = safe_load((Path(__file__).parent / 'credentials.yaml').open()) + key = credentials['key'] + secret = credentials['secret'] + + url = "https://go.apis.huit.harvard.edu/ais-openai-direct-limited-schools/v1/chat/completions" + payload = json.dumps({ + "model": "gpt-4o-mini", + "messages": [ + { + "role": "user", + "content": "What is 2+2?" 
+ } + ], + "temperature": 0.7 + }) + headers = { + 'Content-Type': 'application/json', + 'api-key': key + } + response = requests.request("POST", url, headers=headers, data=payload) + response.raise_for_status() + print(response.text) + + From a22d1b5ca7135befc6c5ad0008099c913a8882b5 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Wed, 15 Jan 2025 14:38:23 -0500 Subject: [PATCH 02/21] factor out "ask" --- run.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/run.py b/run.py index dd3b33a..9cf9ad1 100644 --- a/run.py +++ b/run.py @@ -3,21 +3,20 @@ from pathlib import Path import json -if __name__ == '__main__': +def ask(question, model="gpt-4o-mini", temperature=0.7): credentials = safe_load((Path(__file__).parent / 'credentials.yaml').open()) key = credentials['key'] - secret = credentials['secret'] url = "https://go.apis.huit.harvard.edu/ais-openai-direct-limited-schools/v1/chat/completions" payload = json.dumps({ - "model": "gpt-4o-mini", + "model": model, "messages": [ { "role": "user", - "content": "What is 2+2?" + "content": question } ], - "temperature": 0.7 + "temperature": temperature }) headers = { 'Content-Type': 'application/json', @@ -25,6 +24,15 @@ } response = requests.request("POST", url, headers=headers, data=payload) response.raise_for_status() - print(response.text) + response_json = response.json() + answers = [ + choice['message']['content'] for choice in response_json['choices'] + ] + return answers + + +if __name__ == '__main__': + answers = ask('What is the capital of Georgia?') + print(answers) From a8920c6023854909b5e866c0e303c9f7a98ffb0e Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Wed, 15 Jan 2025 15:02:54 -0500 Subject: [PATCH 03/21] first saved output --- outputs/2025-01-15T15-01-22.yaml | 29 +++++++++++++++++++++++++++++ q-and-a.yaml | 2 +- run.py | 25 +++++++++++++++++++------ 3 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 outputs/2025-01-15T15-01-22.yaml diff --git a/outputs/2025-01-15T15-01-22.yaml b/outputs/2025-01-15T15-01-22.yaml new file mode 100644 index 0000000..1d62005 --- /dev/null +++ b/outputs/2025-01-15T15-01-22.yaml @@ -0,0 +1,29 @@ +- a: + - "Differential privacy is a mathematical framework designed to provide privacy\ + \ guarantees when analyzing and sharing data. It aims to protect individual privacy\ + \ while allowing for useful insights to be drawn from datasets. The key idea is\ + \ to ensure that the output of a data analysis algorithm does not significantly\ + \ depend on any single individual's data, making it difficult to infer whether\ + \ any specific individual's information was included in the dataset.\n\nHere are\ + \ the main concepts behind differential privacy:\n\n1. **Noise Addition**: To\ + \ achieve differential privacy, random noise is added to the results of queries\ + \ on the dataset. This noise is calibrated based on the sensitivity of the query,\ + \ which measures how much the output could change if a single individual's data\ + \ were included or excluded.\n\n2. **Privacy Parameter (\u03B5)**: Differential\ + \ privacy is often quantified using a parameter \u03B5 (epsilon), which controls\ + \ the trade-off between privacy and accuracy. A smaller \u03B5 provides stronger\ + \ privacy guarantees but can result in less accurate results, while a larger \u03B5\ + \ allows for more accurate results at the expense of privacy.\n\n3. 
**Indistinguishability**:\ + \ The core principle of differential privacy is that an observer should not be\ + \ able to determine whether a particular individual's data was included in the\ + \ dataset based on the output of the analysis. This is achieved by ensuring that\ + \ the outputs of the algorithm are statistically indistinguishable whether or\ + \ not the individual's data is included.\n\n4. **Robustness**: Differential privacy\ + \ provides a robust framework that can be applied to various types of data analysis,\ + \ including statistical queries, machine learning, and data sharing.\n\nDifferential\ + \ privacy has gained significant attention and adoption in various fields, including\ + \ healthcare, social sciences, and technology companies, particularly for protecting\ + \ user data in applications like Apple and Google.\n\nOverall, differential privacy\ + \ strikes a balance between the utility of data analysis and the need to protect\ + \ individual privacy, making it a powerful tool in the era of big data." + q: What is differential privacy? diff --git a/q-and-a.yaml b/q-and-a.yaml index f4732e2..510caf2 100644 --- a/q-and-a.yaml +++ b/q-and-a.yaml @@ -1 +1 @@ -- What is 4+4? \ No newline at end of file +- What is differential privacy? \ No newline at end of file diff --git a/run.py b/run.py index 9cf9ad1..86a15ed 100644 --- a/run.py +++ b/run.py @@ -2,6 +2,8 @@ import requests from pathlib import Path import json +import re +from datetime import datetime def ask(question, model="gpt-4o-mini", temperature=0.7): credentials = safe_load((Path(__file__).parent / 'credentials.yaml').open()) @@ -11,10 +13,10 @@ def ask(question, model="gpt-4o-mini", temperature=0.7): payload = json.dumps({ "model": model, "messages": [ - { - "role": "user", - "content": question - } + { + "role": "user", + "content": question + } ], "temperature": temperature }) @@ -32,7 +34,18 @@ def ask(question, model="gpt-4o-mini", temperature=0.7): if __name__ == '__main__': - answers = ask('What is the capital of Georgia?') - print(answers) + q_and_a_in = safe_load((Path(__file__).parent / 'q-and-a.yaml').open()) + q_and_a_out = [] + for question in q_and_a_in: + answers = ask(question) + q_and_a_out.append({ + 'q': question, + 'a': answers, + }) + yaml_out = dump(q_and_a_out) + print(yaml_out) + timestamp = re.sub(r'\..*', '', datetime.now().isoformat()).replace(':', '-') + out_path = Path(__file__).parent / 'outputs' / f'{timestamp}.yaml' + out_path.write_text(yaml_out) From 963169db7bca96bacfdcad416f877e6347d995c1 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Wed, 15 Jan 2025 15:19:28 -0500 Subject: [PATCH 04/21] more readable output --- outputs/2025-01-15T15-18-47.yaml | 62 ++++++++++++++++++++++++++++++++ run.py | 2 +- 2 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 outputs/2025-01-15T15-18-47.yaml diff --git a/outputs/2025-01-15T15-18-47.yaml b/outputs/2025-01-15T15-18-47.yaml new file mode 100644 index 0000000..15218b8 --- /dev/null +++ b/outputs/2025-01-15T15-18-47.yaml @@ -0,0 +1,62 @@ +- q: What is differential privacy? + a: + - 'Differential privacy is a mathematical framework designed to provide a formal + definition of privacy protection when analyzing and sharing data. The core idea + is to ensure that the inclusion or exclusion of a single individual''s data in + a dataset does not significantly affect the outcome of any analysis performed + on that dataset. 
This means that the results of queries or analyses conducted + on the data should be similar, whether or not any particular individual''s data + is included. + + + ### Key Concepts: + + + 1. **Privacy Budget (ε)**: Differential privacy uses a parameter known as epsilon + (ε), which quantifies the level of privacy protection. A smaller ε indicates stronger + privacy guarantees, while a larger ε allows for more accurate data analysis but + weaker privacy. + + + 2. **Noise Addition**: To achieve differential privacy, noise is typically added + to the results of queries on the data. This noise is often drawn from a mathematical + distribution (like Laplace or Gaussian) and is calibrated based on the sensitivity + of the query—how much the output can change with the addition or removal of a + single individual''s data. + + + 3. **Sensitivity**: The sensitivity of a function is a measure of how much the + function''s output can change when a single individual''s data is added or removed. + Functions with lower sensitivity are generally easier to protect with differential + privacy. + + + 4. **Indistinguishability**: The fundamental goal of differential privacy is to + make it difficult for an observer to determine whether any particular individual''s + data was included in the analysis, thus protecting individual privacy. + + + ### Applications: + + + Differential privacy has been adopted in various fields, including: + + + - **Statistics and Data Analysis**: To release aggregate statistics without compromising + individual privacy. + + - **Machine Learning**: To train models on sensitive datasets while protecting + the privacy of the training data. + + - **Public Data Releases**: Governments and organizations use differential privacy + to release census data or other sensitive information while maintaining the confidentiality + of individuals. + + + ### Conclusion: + + + Differential privacy provides a robust framework for balancing the needs for data + utility and individual privacy. By carefully managing the trade-offs between accuracy + and privacy, it allows organizations to analyze and share data without exposing + sensitive information about individuals.' 
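The recorded answer above explains the two load-bearing ideas of differential privacy: noise calibrated to a query's sensitivity, and the privacy parameter ε. As a concrete illustration of that mechanism (not part of this repository), a minimal Laplace-mechanism sketch for a sensitivity-1 count query might look like:

```
import random


def laplace_mechanism(true_value, sensitivity, epsilon):
    """Return true_value plus Laplace noise with scale = sensitivity / epsilon."""
    # The difference of two independent Exponential draws with rate 1/scale
    # is Laplace-distributed with that scale.
    scale = sensitivity / epsilon
    noise = random.expovariate(1 / scale) - random.expovariate(1 / scale)
    return true_value + noise


# A count changes by at most 1 when one person joins or leaves the dataset,
# so its sensitivity is 1; smaller epsilon means more noise, stronger privacy.
print(laplace_mechanism(true_value=42, sensitivity=1, epsilon=0.5))
```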
diff --git a/run.py b/run.py index 86a15ed..49ad55c 100644 --- a/run.py +++ b/run.py @@ -42,7 +42,7 @@ def ask(question, model="gpt-4o-mini", temperature=0.7): 'q': question, 'a': answers, }) - yaml_out = dump(q_and_a_out) + yaml_out = dump(q_and_a_out, sort_keys=False, allow_unicode=True, default_flow_style=False) print(yaml_out) timestamp = re.sub(r'\..*', '', datetime.now().isoformat()).replace(':', '-') out_path = Path(__file__).parent / 'outputs' / f'{timestamp}.yaml' From 1219e73b22bab01b91f796877e34aec735e9b34b Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Wed, 15 Jan 2025 15:32:18 -0500 Subject: [PATCH 05/21] precommit black --- .pre-commit-config.yaml | 17 +++++++++++ README.md | 3 +- requirements.txt | 6 +++- run.py | 68 ++++++++++++++++++++++------------------- 4 files changed, 61 insertions(+), 33 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..d07faf4 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + # Using this mirror lets us use mypyc-compiled black, which is about 2x faster + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 24.8.0 + hooks: + - id: black + # It is recommended to specify the latest version of Python + # supported by your project here, or alternatively use + # pre-commit's default_language_version, see + # https://pre-commit.com/#top_level-default_language_version + language_version: python3.11 diff --git a/README.md b/README.md index 65abea4..f801d92 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ To get started: python3 -m venv .venv source .venv/bin/activate pip install -r requirements.txt +pre-commit install ``` Fill in `credentials-template.yaml` with the [key and secret for this app](https://portal.apis.huit.harvard.edu/my-apps/6dce5383-bcb6-4c9f-bd14-8f59d356b221), and then copy into place: @@ -17,4 +18,4 @@ You should then be able to run the queries against the API, and there should be ``` run.py ls outputs -``` \ No newline at end of file +``` diff --git a/requirements.txt b/requirements.txt index 0f2eecc..ee581c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,6 @@ pyyaml -requests \ No newline at end of file +requests +pre-commit +flake8 +flake8-bugbear +black diff --git a/run.py b/run.py index 49ad55c..ef003a7 100644 --- a/run.py +++ b/run.py @@ -5,47 +5,53 @@ import re from datetime import datetime -def ask(question, model="gpt-4o-mini", temperature=0.7): - credentials = safe_load((Path(__file__).parent / 'credentials.yaml').open()) - key = credentials['key'] - url = "https://go.apis.huit.harvard.edu/ais-openai-direct-limited-schools/v1/chat/completions" - payload = json.dumps({ +def get_config(model="gpt-4o-mini", temperature=0.7): + return { "model": model, - "messages": [ - { - "role": "user", - "content": question - } - ], - "temperature": temperature - }) - headers = { - 'Content-Type': 'application/json', - 'api-key': key + "temperature": temperature, } + + +def ask(question, model, temperature): + credentials = safe_load((Path(__file__).parent / "credentials.yaml").open()) + key = credentials["key"] + + url = "https://go.apis.huit.harvard.edu/ais-openai-direct-limited-schools/v1/chat/completions" + payload = json.dumps( + { + "model": model, + "messages": [{"role": "user", "content": question}], + "temperature": 
temperature, + } + ) + headers = {"Content-Type": "application/json", "api-key": key} response = requests.request("POST", url, headers=headers, data=payload) response.raise_for_status() response_json = response.json() - answers = [ - choice['message']['content'] for choice in response_json['choices'] - ] + answers = [choice["message"]["content"] for choice in response_json["choices"]] return answers -if __name__ == '__main__': - q_and_a_in = safe_load((Path(__file__).parent / 'q-and-a.yaml').open()) +if __name__ == "__main__": + config = get_config() + q_and_a_in = safe_load((Path(__file__).parent / "q-and-a.yaml").open()) q_and_a_out = [] for question in q_and_a_in: - answers = ask(question) - q_and_a_out.append({ - 'q': question, - 'a': answers, - }) - yaml_out = dump(q_and_a_out, sort_keys=False, allow_unicode=True, default_flow_style=False) + answers = ask(question, **config) + q_and_a_out.append( + { + "q": question, + "a": answers, + } + ) + yaml_out = dump( + {"config": config, "q_and_a": q_and_a_out}, + sort_keys=False, + allow_unicode=True, + default_flow_style=False, + ) print(yaml_out) - timestamp = re.sub(r'\..*', '', datetime.now().isoformat()).replace(':', '-') - out_path = Path(__file__).parent / 'outputs' / f'{timestamp}.yaml' + timestamp = re.sub(r"\..*", "", datetime.now().isoformat()).replace(":", "-") + out_path = Path(__file__).parent / "outputs" / f"{timestamp}.yaml" out_path.write_text(yaml_out) - - From 9fc493e5d926a75e1e9745c850e38e0eeef8b46f Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Wed, 15 Jan 2025 15:44:17 -0500 Subject: [PATCH 06/21] more metadata --- outputs/2025-01-15T15-43-06.yaml | 13 +++++++++++++ q-and-a.yaml | 2 +- run.py | 19 +++++++++++++++++-- 3 files changed, 31 insertions(+), 3 deletions(-) create mode 100644 outputs/2025-01-15T15-43-06.yaml diff --git a/outputs/2025-01-15T15-43-06.yaml b/outputs/2025-01-15T15-43-06.yaml new file mode 100644 index 0000000..7b82680 --- /dev/null +++ b/outputs/2025-01-15T15-43-06.yaml @@ -0,0 +1,13 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0.7 + git_hash: 1219e73 + datetime: '2025-01-15T15:43:06.734594' +q_and_a: +- q: In a few words, what is differential privacy? + a: + - Differential privacy is a technique used to ensure that the output of a database + query does not reveal too much information about any individual record in the + database, thereby protecting individual privacy while still allowing for useful + data analysis. diff --git a/q-and-a.yaml b/q-and-a.yaml index 510caf2..b6f7462 100644 --- a/q-and-a.yaml +++ b/q-and-a.yaml @@ -1 +1 @@ -- What is differential privacy? \ No newline at end of file +- In a few words, what is differential privacy? 
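The filename of the new output file above is derived from the same ISO timestamp recorded in its `datetime` metadata: run.py strips the fractional seconds with a regex and swaps colons for hyphens so the name is filesystem-safe (the run.py diff below only changes which variable holds the ISO string). A standalone sketch of that transformation, assuming the format shown above:

```
import re
from datetime import datetime

datetime_now = datetime.now().isoformat()  # e.g. '2025-01-15T15:43:06.734594'
timestamp = re.sub(r"\..*", "", datetime_now).replace(":", "-")
print(timestamp)  # e.g. '2025-01-15T15-43-06'
```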
diff --git a/run.py b/run.py index ef003a7..9998aeb 100644 --- a/run.py +++ b/run.py @@ -4,6 +4,7 @@ import json import re from datetime import datetime +import subprocess def get_config(model="gpt-4o-mini", temperature=0.7): @@ -13,6 +14,13 @@ def get_config(model="gpt-4o-mini", temperature=0.7): } +def get_git_hash(): + completed = subprocess.run( + "git rev-parse --short HEAD", shell=True, capture_output=True + ) + return completed.stdout.decode().strip() + + def ask(question, model, temperature): credentials = safe_load((Path(__file__).parent / "credentials.yaml").open()) key = credentials["key"] @@ -45,13 +53,20 @@ def ask(question, model, temperature): "a": answers, } ) + + datetime_now = datetime.now().isoformat() + metadata = { + "config": config, + "git_hash": get_git_hash(), + "datetime": datetime_now, + } yaml_out = dump( - {"config": config, "q_and_a": q_and_a_out}, + {"metadata": metadata, "q_and_a": q_and_a_out}, sort_keys=False, allow_unicode=True, default_flow_style=False, ) print(yaml_out) - timestamp = re.sub(r"\..*", "", datetime.now().isoformat()).replace(":", "-") + timestamp = re.sub(r"\..*", "", datetime_now).replace(":", "-") out_path = Path(__file__).parent / "outputs" / f"{timestamp}.yaml" out_path.write_text(yaml_out) From cdb186ff16086a5d098bc1e0b19246c41fc72e24 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Wed, 15 Jan 2025 16:11:01 -0500 Subject: [PATCH 07/21] copy template, then fill --- README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f801d92..44949e2 100644 --- a/README.md +++ b/README.md @@ -7,14 +7,12 @@ python3 -m venv .venv source .venv/bin/activate pip install -r requirements.txt pre-commit install -``` - -Fill in `credentials-template.yaml` with the [key and secret for this app](https://portal.apis.huit.harvard.edu/my-apps/6dce5383-bcb6-4c9f-bd14-8f59d356b221), and then copy into place: -``` cp credentials-template.yaml credentials.yaml ``` -You should then be able to run the queries against the API, and there should be a new output file: +Fill in the git-ignored `credentials.yaml` with the [key and secret for this app](https://portal.apis.huit.harvard.edu/my-apps/6dce5383-bcb6-4c9f-bd14-8f59d356b221). + +You should then be able to run the queries against the API: The output will also be written to `outputs/`: ``` run.py ls outputs From 0eb6250da0ec013c684842c42c303756b5a26ea9 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Thu, 16 Jan 2025 15:59:20 -0500 Subject: [PATCH 08/21] change input format; record runtime --- outputs/2025-01-16T15-57-55.yaml | 22 +++++++++++++ q-and-a.yaml | 11 ++++++- run.py | 55 ++++++++++++++++++++++---------- 3 files changed, 70 insertions(+), 18 deletions(-) create mode 100644 outputs/2025-01-16T15-57-55.yaml diff --git a/outputs/2025-01-16T15-57-55.yaml b/outputs/2025-01-16T15-57-55.yaml new file mode 100644 index 0000000..7c9f978 --- /dev/null +++ b/outputs/2025-01-16T15-57-55.yaml @@ -0,0 +1,22 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + git_hash: cdb186f + datetime: '2025-01-16T15:57:55.953321' +q_and_a: +- question: In a few words, what is differential privacy? + human answer: 'Differential privacy is a rigorous mathematical definition of privacy. + Consider an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. 
Differential + privacy achieves this by carefully injecting random noise into the released statistics + to hide the effects of each individual. + + ' + llm answers: + - Differential privacy is a mathematical framework that ensures the privacy of individuals + in a dataset by adding noise to the data or query results, making it difficult + to identify any individual's information while still allowing for useful aggregate + insights. + runtime: '0:00:03.099645' diff --git a/q-and-a.yaml b/q-and-a.yaml index b6f7462..6227375 100644 --- a/q-and-a.yaml +++ b/q-and-a.yaml @@ -1 +1,10 @@ -- In a few words, what is differential privacy? +- + Q: In a few words, what is differential privacy? + A: | + Differential privacy is a rigorous mathematical definition of privacy. Consider an algorithm that analyzes a dataset and releases statistics: The algorithm is differentially private if by looking at the output, you cannot tell whether any individual's data was included in the original dataset or not. Differential privacy achieves this by carefully injecting random noise into the released statistics to hide the effects of each individual. + criteria: + yes: + - Does the answer mention the rigorous mathematical definition of differential privacy? + - Does the answer describe adding calibrated noise to statistics? + no: + - Does the answer describe anonymizing data by stripping identifiers? diff --git a/run.py b/run.py index 9998aeb..b13c309 100644 --- a/run.py +++ b/run.py @@ -7,7 +7,7 @@ import subprocess -def get_config(model="gpt-4o-mini", temperature=0.7): +def get_config(model="gpt-4o-mini", temperature=0): return { "model": model, "temperature": temperature, @@ -21,11 +21,23 @@ def get_git_hash(): return completed.stdout.decode().strip() -def ask(question, model, temperature): - credentials = safe_load((Path(__file__).parent / "credentials.yaml").open()) - key = credentials["key"] +def get_key(): + credentials = load_yaml("credentials.yaml") + return credentials["key"] + + +def load_yaml(file_name): + return safe_load((Path(__file__).parent / file_name).open()) + + +api_base = "https://go.apis.huit.harvard.edu/ais-openai-direct-limited-schools/v1" - url = "https://go.apis.huit.harvard.edu/ais-openai-direct-limited-schools/v1/chat/completions" + +def ask(question, model, temperature): + headers = { + "Content-Type": "application/json", + "api-key": get_key(), + } payload = json.dumps( { "model": model, @@ -33,32 +45,41 @@ def ask(question, model, temperature): "temperature": temperature, } ) - headers = {"Content-Type": "application/json", "api-key": key} - response = requests.request("POST", url, headers=headers, data=payload) + response = requests.request( + method="POST", + url=f"{api_base}/chat/completions", + headers=headers, + data=payload, + ) response.raise_for_status() - response_json = response.json() - answers = [choice["message"]["content"] for choice in response_json["choices"]] + answers = [choice["message"]["content"] for choice in response.json()["choices"]] return answers if __name__ == "__main__": config = get_config() - q_and_a_in = safe_load((Path(__file__).parent / "q-and-a.yaml").open()) + q_and_a_in = load_yaml("q-and-a.yaml") q_and_a_out = [] - for question in q_and_a_in: - answers = ask(question, **config) + for q_a in q_and_a_in: + question = q_a["Q"] + human_answer = q_a["A"] + start_time = datetime.now() + llm_answers = ask(question, **config) + end_time = datetime.now() q_and_a_out.append( { - "q": question, - "a": answers, + "question": question, + "human answer": 
human_answer, + "llm answers": llm_answers, + "runtime": str(end_time - start_time), } ) - datetime_now = datetime.now().isoformat() + datetime_now = datetime.now() metadata = { "config": config, "git_hash": get_git_hash(), - "datetime": datetime_now, + "datetime": datetime_now.isoformat(), } yaml_out = dump( {"metadata": metadata, "q_and_a": q_and_a_out}, @@ -67,6 +88,6 @@ def ask(question, model, temperature): default_flow_style=False, ) print(yaml_out) - timestamp = re.sub(r"\..*", "", datetime_now).replace(":", "-") + timestamp = re.sub(r"\..*", "", datetime_now.isoformat()).replace(":", "-") out_path = Path(__file__).parent / "outputs" / f"{timestamp}.yaml" out_path.write_text(yaml_out) From 550587f6bd9ba159038f5a53c70d1393943216aa Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Thu, 16 Jan 2025 16:31:01 -0500 Subject: [PATCH 09/21] CLI args --- outputs/2025-01-16T16-29-09.yaml | 22 ++++++++++++++++ outputs/2025-01-16T16-29-59.yaml | 23 ++++++++++++++++ run.py | 45 +++++++++++++++++++++----------- 3 files changed, 75 insertions(+), 15 deletions(-) create mode 100644 outputs/2025-01-16T16-29-09.yaml create mode 100644 outputs/2025-01-16T16-29-59.yaml mode change 100644 => 100755 run.py diff --git a/outputs/2025-01-16T16-29-09.yaml b/outputs/2025-01-16T16-29-09.yaml new file mode 100644 index 0000000..3306712 --- /dev/null +++ b/outputs/2025-01-16T16-29-09.yaml @@ -0,0 +1,22 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + datetime: '2025-01-16T16:29:09.845071' + git_hash: 0eb6250 +q_and_a: +- question: In a few words, what is differential privacy? + human_answer: 'Differential privacy is a rigorous mathematical definition of privacy. + Consider an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released statistics + to hide the effects of each individual. + + ' + llm_answers: + - Differential privacy is a mathematical framework that ensures the privacy of individuals + in a dataset by adding noise to the data or the results of queries, making it + difficult to identify any individual's information while still allowing for useful + aggregate insights. + runtime: '0:00:02.060478' diff --git a/outputs/2025-01-16T16-29-59.yaml b/outputs/2025-01-16T16-29-59.yaml new file mode 100644 index 0000000..92cca0f --- /dev/null +++ b/outputs/2025-01-16T16-29-59.yaml @@ -0,0 +1,23 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0.99 + datetime: '2025-01-16T16:29:59.318412' + git_hash: 0eb6250 +q_and_a: +- question: In a few words, what is differential privacy? + human_answer: 'Differential privacy is a rigorous mathematical definition of privacy. + Consider an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released statistics + to hide the effects of each individual. + + ' + llm_answers: + - Differential privacy is a framework designed to provide strong privacy guarantees + when analyzing and sharing statistical data. 
It ensures that the inclusion or + exclusion of an individual's data does not significantly affect the output of + a function, thereby protecting individual privacy while still allowing for useful + insights from aggregated data. + runtime: '0:00:01.193081' diff --git a/run.py b/run.py old mode 100644 new mode 100755 index b13c309..d4dd866 --- a/run.py +++ b/run.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 from yaml import safe_load, dump import requests from pathlib import Path @@ -5,12 +6,17 @@ import re from datetime import datetime import subprocess +import argparse -def get_config(model="gpt-4o-mini", temperature=0): +def get_config(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", default="gpt-4o-mini") + parser.add_argument("--temperature", type=float, default=0) + args = parser.parse_args() return { - "model": model, - "temperature": temperature, + "model": args.model, + "temperature": args.temperature, } @@ -56,8 +62,7 @@ def ask(question, model, temperature): return answers -if __name__ == "__main__": - config = get_config() +def ask_all_questions(config): q_and_a_in = load_yaml("q-and-a.yaml") q_and_a_out = [] for q_a in q_and_a_in: @@ -69,25 +74,35 @@ def ask(question, model, temperature): q_and_a_out.append( { "question": question, - "human answer": human_answer, - "llm answers": llm_answers, + "human_answer": human_answer, + "llm_answers": llm_answers, "runtime": str(end_time - start_time), } ) + return q_and_a_out - datetime_now = datetime.now() - metadata = { - "config": config, - "git_hash": get_git_hash(), - "datetime": datetime_now.isoformat(), - } + +def save_results(datetime_now, results): yaml_out = dump( - {"metadata": metadata, "q_and_a": q_and_a_out}, + results, sort_keys=False, allow_unicode=True, default_flow_style=False, ) print(yaml_out) - timestamp = re.sub(r"\..*", "", datetime_now.isoformat()).replace(":", "-") + timestamp = re.sub(r"\..*", "", datetime_now).replace(":", "-") out_path = Path(__file__).parent / "outputs" / f"{timestamp}.yaml" out_path.write_text(yaml_out) + + +if __name__ == "__main__": + config = get_config() + datetime_now = datetime.now().isoformat() + metadata = { + "config": config, + "datetime": datetime_now, + "git_hash": get_git_hash(), + } + q_and_a = ask_all_questions(config) + results = {"metadata": metadata, "q_and_a": q_and_a} + save_results(datetime_now, results) From a25fc613f15cc959649477ecfdb50053483f6529 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Thu, 16 Jan 2025 17:46:53 -0500 Subject: [PATCH 10/21] self-evaluation --- outputs/2025-01-16T17-42-52.yaml | 37 ++++++++++++++++ q-and-a.yaml | 11 ++--- run.py | 76 +++++++++++++++++++++++++++----- 3 files changed, 109 insertions(+), 15 deletions(-) create mode 100644 outputs/2025-01-16T17-42-52.yaml diff --git a/outputs/2025-01-16T17-42-52.yaml b/outputs/2025-01-16T17-42-52.yaml new file mode 100644 index 0000000..ced4c2d --- /dev/null +++ b/outputs/2025-01-16T17-42-52.yaml @@ -0,0 +1,37 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + datetime: '2025-01-16T17:42:52.450949' + git_hash: 550587f +q_and_a: +- question: In a few words, what is differential privacy? + human_answers: + - evalution: Does the answer mention the rigorous mathematical definition of differential + privacy? + expected: true + actual: + - 'Yes' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? 
+ expected: false + actual: + - 'No' + llm_answers: + - evalution: Does the answer mention the rigorous mathematical definition of differential + privacy? + expected: true + actual: + - 'No' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + runtime: '0:00:01.043781' diff --git a/q-and-a.yaml b/q-and-a.yaml index 6227375..255fd4e 100644 --- a/q-and-a.yaml +++ b/q-and-a.yaml @@ -1,10 +1,11 @@ - Q: In a few words, what is differential privacy? - A: | - Differential privacy is a rigorous mathematical definition of privacy. Consider an algorithm that analyzes a dataset and releases statistics: The algorithm is differentially private if by looking at the output, you cannot tell whether any individual's data was included in the original dataset or not. Differential privacy achieves this by carefully injecting random noise into the released statistics to hide the effects of each individual. - criteria: - yes: + A: + - | + Differential privacy is a rigorous mathematical definition of privacy. Consider an algorithm that analyzes a dataset and releases statistics: The algorithm is differentially private if by looking at the output, you cannot tell whether any individual's data was included in the original dataset or not. Differential privacy achieves this by carefully injecting random noise into the released statistics to hide the effects of each individual. + evaluation: + true: - Does the answer mention the rigorous mathematical definition of differential privacy? - Does the answer describe adding calibrated noise to statistics? - no: + false: - Does the answer describe anonymizing data by stripping identifiers? diff --git a/run.py b/run.py index d4dd866..f1b0ae0 100755 --- a/run.py +++ b/run.py @@ -7,6 +7,7 @@ from datetime import datetime import subprocess import argparse +import logging def get_config(): @@ -39,7 +40,8 @@ def load_yaml(file_name): api_base = "https://go.apis.huit.harvard.edu/ais-openai-direct-limited-schools/v1" -def ask(question, model, temperature): +def ask_one_question(question, model, temperature): + logging.info(f"Q: {question}") headers = { "Content-Type": "application/json", "api-key": get_key(), @@ -51,15 +53,40 @@ def ask(question, model, temperature): "temperature": temperature, } ) + start_time = datetime.now() response = requests.request( method="POST", url=f"{api_base}/chat/completions", headers=headers, data=payload, ) + end_time = datetime.now() response.raise_for_status() answers = [choice["message"]["content"] for choice in response.json()["choices"]] - return answers + for answer in answers: + logging.info(f"A: {answer}") + return answers, (end_time - start_time) + + +def ask_evaluation(question, answer, evaluation): + # For the evaluation, we want boring, reliable answers, + # even as we change the parameters for the primary query. + # Might surface these as a separate config at some point. 
+ model = "gpt-4o-mini" + temperature = 0 + question_answer_evaluation = f"""First, read the following question and answer pair: + +Question: {question} + +Answer: +''' +{answer} +''' + +Considering the response above, answer the following question with "yes" or "no": +{evaluation} +""" + return ask_one_question(question_answer_evaluation, model, temperature) def ask_all_questions(config): @@ -67,16 +94,45 @@ def ask_all_questions(config): q_and_a_out = [] for q_a in q_and_a_in: question = q_a["Q"] - human_answer = q_a["A"] - start_time = datetime.now() - llm_answers = ask(question, **config) - end_time = datetime.now() + human_answers = q_a["A"] + human_answers_evaluation = [] + for answer in human_answers: + breakpoint() + evaluation_true = q_a["evaluation"][True] + for evaluation in evaluation_true: + response, _runtime = ask_evaluation(question, answer, evaluation) + human_answers_evaluation.append( + {"evalution": evaluation, "expected": True, "actual": response} + ) + evaluation_false = q_a["evaluation"][False] + for evaluation in evaluation_false: + response, _runtime = ask_evaluation(question, answer, evaluation) + human_answers_evaluation.append( + {"evalution": evaluation, "expected": False, "actual": response} + ) + + llm_answers, runtime = ask_one_question(question, **config) + llm_answers_evaluation = [] + for answer in llm_answers: + evaluation_true = q_a["evaluation"][True] + for evaluation in evaluation_true: + response, _runtime = ask_evaluation(question, answer, evaluation) + llm_answers_evaluation.append( + {"evalution": evaluation, "expected": True, "actual": response} + ) + evaluation_false = q_a["evaluation"][False] + for evaluation in evaluation_false: + response, _runtime = ask_evaluation(question, answer, evaluation) + llm_answers_evaluation.append( + {"evalution": evaluation, "expected": False, "actual": response} + ) + q_and_a_out.append( { "question": question, - "human_answer": human_answer, - "llm_answers": llm_answers, - "runtime": str(end_time - start_time), + "human_answers": human_answers_evaluation, + "llm_answers": llm_answers_evaluation, + "runtime": str(runtime), } ) return q_and_a_out @@ -89,13 +145,13 @@ def save_results(datetime_now, results): allow_unicode=True, default_flow_style=False, ) - print(yaml_out) timestamp = re.sub(r"\..*", "", datetime_now).replace(":", "-") out_path = Path(__file__).parent / "outputs" / f"{timestamp}.yaml" out_path.write_text(yaml_out) if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) config = get_config() datetime_now = datetime.now().isoformat() metadata = { From a0793a2babfecf47eee72df429646a53e45c91d5 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Thu, 16 Jan 2025 18:18:47 -0500 Subject: [PATCH 11/21] include answers in output --- outputs/2025-01-16T18-14-39.yaml | 49 +++++++++++++++++++++++++++++ q-and-a.yaml | 2 +- run.py | 53 +++++++++++++++----------------- 3 files changed, 74 insertions(+), 30 deletions(-) create mode 100644 outputs/2025-01-16T18-14-39.yaml diff --git a/outputs/2025-01-16T18-14-39.yaml b/outputs/2025-01-16T18-14-39.yaml new file mode 100644 index 0000000..3c270a4 --- /dev/null +++ b/outputs/2025-01-16T18-14-39.yaml @@ -0,0 +1,49 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + datetime: '2025-01-16T18:14:39.582095' + git_hash: a25fc61 +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. 
Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'Yes' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + llm: + ? Differential privacy is a mathematical framework that ensures the privacy of + individuals in a dataset by adding noise to the data or the query results, making + it difficult to identify any individual's information while still allowing for + useful aggregate insights. + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'No' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + runtime: '0:00:01.491051' diff --git a/q-and-a.yaml b/q-and-a.yaml index 255fd4e..33e43b2 100644 --- a/q-and-a.yaml +++ b/q-and-a.yaml @@ -3,7 +3,7 @@ A: - | Differential privacy is a rigorous mathematical definition of privacy. Consider an algorithm that analyzes a dataset and releases statistics: The algorithm is differentially private if by looking at the output, you cannot tell whether any individual's data was included in the original dataset or not. Differential privacy achieves this by carefully injecting random noise into the released statistics to hide the effects of each individual. - evaluation: + evaluations: true: - Does the answer mention the rigorous mathematical definition of differential privacy? - Does the answer describe adding calibrated noise to statistics? 
diff --git a/run.py b/run.py index f1b0ae0..29ff705 100755 --- a/run.py +++ b/run.py @@ -89,49 +89,44 @@ def ask_evaluation(question, answer, evaluation): return ask_one_question(question_answer_evaluation, model, temperature) +def evaluate(question, answer, evaluations_in): + evaluations_out = [] + for expected in [True, False]: + for evaluation in evaluations_in[expected]: + actual, _runtime = ask_evaluation(question, answer, evaluation) + evaluations_out.append( + { + "evalution": evaluation, + "expected": expected, + "actual": actual, + } + ) + return evaluations_out + + def ask_all_questions(config): q_and_a_in = load_yaml("q-and-a.yaml") q_and_a_out = [] for q_a in q_and_a_in: question = q_a["Q"] + human_answers = q_a["A"] - human_answers_evaluation = [] + human_answers_evaluation = {} for answer in human_answers: - breakpoint() - evaluation_true = q_a["evaluation"][True] - for evaluation in evaluation_true: - response, _runtime = ask_evaluation(question, answer, evaluation) - human_answers_evaluation.append( - {"evalution": evaluation, "expected": True, "actual": response} - ) - evaluation_false = q_a["evaluation"][False] - for evaluation in evaluation_false: - response, _runtime = ask_evaluation(question, answer, evaluation) - human_answers_evaluation.append( - {"evalution": evaluation, "expected": False, "actual": response} - ) + evaluation = evaluate(question, answer, q_a["evaluations"]) + human_answers_evaluation[answer] = evaluation llm_answers, runtime = ask_one_question(question, **config) - llm_answers_evaluation = [] + llm_answers_evaluation = {} for answer in llm_answers: - evaluation_true = q_a["evaluation"][True] - for evaluation in evaluation_true: - response, _runtime = ask_evaluation(question, answer, evaluation) - llm_answers_evaluation.append( - {"evalution": evaluation, "expected": True, "actual": response} - ) - evaluation_false = q_a["evaluation"][False] - for evaluation in evaluation_false: - response, _runtime = ask_evaluation(question, answer, evaluation) - llm_answers_evaluation.append( - {"evalution": evaluation, "expected": False, "actual": response} - ) + evaluation = evaluate(question, answer, q_a["evaluations"]) + llm_answers_evaluation[answer] = evaluation q_and_a_out.append( { "question": question, - "human_answers": human_answers_evaluation, - "llm_answers": llm_answers_evaluation, + "human": human_answers_evaluation, + "llm": llm_answers_evaluation, "runtime": str(runtime), } ) From 3ef7bdd465c7ee8f2485622ddcf1ca142e7c4900 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 17 Jan 2025 10:14:24 -0500 Subject: [PATCH 12/21] pre-prompt --- outputs/2025-01-17T10-12-30.yaml | 50 ++++++++++++++++++++++++++++++++ run.py | 42 +++++++++++++++------------ 2 files changed, 74 insertions(+), 18 deletions(-) create mode 100644 outputs/2025-01-17T10-12-30.yaml diff --git a/outputs/2025-01-17T10-12-30.yaml b/outputs/2025-01-17T10-12-30.yaml new file mode 100644 index 0000000..3acab07 --- /dev/null +++ b/outputs/2025-01-17T10-12-30.yaml @@ -0,0 +1,50 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + pre: '' + datetime: '2025-01-17T10:12:30.943123' + git_hash: a0793a2 +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. 
Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - Yes. + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + llm: + ? Differential privacy is a mathematical framework that ensures the privacy of + individuals in a dataset by adding noise to the data or the results of queries, + making it difficult to identify any individual's information while still allowing + for useful aggregate insights. + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'No' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + runtime: '0:00:01.027454' diff --git a/run.py b/run.py index 29ff705..2dcce55 100755 --- a/run.py +++ b/run.py @@ -14,11 +14,9 @@ def get_config(): parser = argparse.ArgumentParser() parser.add_argument("--model", default="gpt-4o-mini") parser.add_argument("--temperature", type=float, default=0) + parser.add_argument("--pre", default="") args = parser.parse_args() - return { - "model": args.model, - "temperature": args.temperature, - } + return {"model": args.model, "temperature": args.temperature, "pre": args.pre} def get_git_hash(): @@ -40,7 +38,7 @@ def load_yaml(file_name): api_base = "https://go.apis.huit.harvard.edu/ais-openai-direct-limited-schools/v1" -def ask_one_question(question, model, temperature): +def ask_one_question(question, model, temperature, pre): logging.info(f"Q: {question}") headers = { "Content-Type": "application/json", @@ -49,7 +47,7 @@ def ask_one_question(question, model, temperature): payload = json.dumps( { "model": model, - "messages": [{"role": "user", "content": question}], + "messages": [{"role": "user", "content": f"{pre}\n{question}"}], "temperature": temperature, } ) @@ -74,6 +72,7 @@ def ask_evaluation(question, answer, evaluation): # Might surface these as a separate config at some point. 
model = "gpt-4o-mini" temperature = 0 + pre = "" question_answer_evaluation = f"""First, read the following question and answer pair: Question: {question} @@ -86,10 +85,10 @@ def ask_evaluation(question, answer, evaluation): Considering the response above, answer the following question with "yes" or "no": {evaluation} """ - return ask_one_question(question_answer_evaluation, model, temperature) + return ask_one_question(question_answer_evaluation, model, temperature, pre) -def evaluate(question, answer, evaluations_in): +def evaluate_one_answer(question, answer, evaluations_in): evaluations_out = [] for expected in [True, False]: for evaluation in evaluations_in[expected]: @@ -104,29 +103,36 @@ def evaluate(question, answer, evaluations_in): return evaluations_out +def evaluate_all_answers(question, answers, evaluation_questions): + evaluation_answers = {} + for answer in answers: + evaluation = evaluate_one_answer(question, answer, evaluation_questions) + evaluation_answers[answer] = evaluation + return evaluation_answers + + def ask_all_questions(config): q_and_a_in = load_yaml("q-and-a.yaml") q_and_a_out = [] for q_a in q_and_a_in: question = q_a["Q"] + evaluations = q_a["evaluations"] human_answers = q_a["A"] - human_answers_evaluation = {} - for answer in human_answers: - evaluation = evaluate(question, answer, q_a["evaluations"]) - human_answers_evaluation[answer] = evaluation + human_answers_evaluations = evaluate_all_answers( + question, human_answers, evaluations + ) llm_answers, runtime = ask_one_question(question, **config) - llm_answers_evaluation = {} - for answer in llm_answers: - evaluation = evaluate(question, answer, q_a["evaluations"]) - llm_answers_evaluation[answer] = evaluation + llm_answers_evaluations = evaluate_all_answers( + question, llm_answers, evaluations + ) q_and_a_out.append( { "question": question, - "human": human_answers_evaluation, - "llm": llm_answers_evaluation, + "human": human_answers_evaluations, + "llm": llm_answers_evaluations, "runtime": str(runtime), } ) From 57b44ea9d7e33069024e6b9fa0b079fa747c27c9 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 17 Jan 2025 10:56:53 -0500 Subject: [PATCH 13/21] use openai library, add param for system --- outputs/2025-01-17T10-55-36.yaml | 49 ++++++++++++++++++++++++++++++ requirements.txt | 2 +- run.py | 52 +++++++++++++++----------------- 3 files changed, 74 insertions(+), 29 deletions(-) create mode 100644 outputs/2025-01-17T10-55-36.yaml diff --git a/outputs/2025-01-17T10-55-36.yaml b/outputs/2025-01-17T10-55-36.yaml new file mode 100644 index 0000000..4988bdc --- /dev/null +++ b/outputs/2025-01-17T10-55-36.yaml @@ -0,0 +1,49 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: You are an expert answering questions for a five year old. + datetime: '2025-01-17T10:55:36.164764' + git_hash: 3ef7bdd +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? 
+ expected: true + actual: + - 'Yes' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + llm: + ? Differential privacy is like a magic cloak that helps keep your secrets safe + when sharing information. It makes sure that even if someone looks at the data, + they can't tell anything special about you! + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'No' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - 'No' + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + runtime: '0:00:01.270520' diff --git a/requirements.txt b/requirements.txt index ee581c6..b91f6da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pyyaml -requests pre-commit flake8 flake8-bugbear black +openai diff --git a/run.py b/run.py index 2dcce55..5ee0857 100755 --- a/run.py +++ b/run.py @@ -1,22 +1,25 @@ #!/usr/bin/env python3 from yaml import safe_load, dump -import requests from pathlib import Path -import json import re from datetime import datetime import subprocess import argparse import logging +from openai import OpenAI def get_config(): parser = argparse.ArgumentParser() parser.add_argument("--model", default="gpt-4o-mini") parser.add_argument("--temperature", type=float, default=0) - parser.add_argument("--pre", default="") + parser.add_argument("--system") args = parser.parse_args() - return {"model": args.model, "temperature": args.temperature, "pre": args.pre} + return { + "model": args.model, + "temperature": args.temperature, + "system": args.system, + } def get_git_hash(): @@ -35,32 +38,25 @@ def load_yaml(file_name): return safe_load((Path(__file__).parent / file_name).open()) -api_base = "https://go.apis.huit.harvard.edu/ais-openai-direct-limited-schools/v1" - - -def ask_one_question(question, model, temperature, pre): +def ask_one_question(question, model, temperature, system): logging.info(f"Q: {question}") - headers = { - "Content-Type": "application/json", - "api-key": get_key(), - } - payload = json.dumps( - { - "model": model, - "messages": [{"role": "user", "content": f"{pre}\n{question}"}], - "temperature": temperature, - } - ) + start_time = datetime.now() - response = requests.request( - method="POST", - url=f"{api_base}/chat/completions", - headers=headers, - data=payload, + client = OpenAI( + base_url="https://go.apis.huit.harvard.edu/ais-openai-direct-limited-schools/v1", + api_key=get_key(), + ) + messages = [{"role": "user", "content": question}] + if system: + messages.append({"role": "system", "content": system}) + completions = client.chat.completions.create( + messages=messages, + model=model, + temperature=temperature, ) end_time = datetime.now() - response.raise_for_status() - answers = [choice["message"]["content"] for choice in response.json()["choices"]] + + answers = [choice.message.content for choice in completions.choices] for answer in answers: logging.info(f"A: {answer}") return answers, (end_time - start_time) @@ -72,7 +68,7 @@ def ask_evaluation(question, answer, evaluation): # Might surface these as a separate config at some point. 
model = "gpt-4o-mini" temperature = 0 - pre = "" + system = None question_answer_evaluation = f"""First, read the following question and answer pair: Question: {question} @@ -85,7 +81,7 @@ def ask_evaluation(question, answer, evaluation): Considering the response above, answer the following question with "yes" or "no": {evaluation} """ - return ask_one_question(question_answer_evaluation, model, temperature, pre) + return ask_one_question(question_answer_evaluation, model, temperature, system) def evaluate_one_answer(question, answer, evaluations_in): From eae75e84387652dd801524cead8a9243d6a31814 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 17 Jan 2025 11:33:02 -0500 Subject: [PATCH 14/21] Boolean responses! --- outputs/2025-01-17T11-12-04.yaml | 47 ++++++++++++++++++++++++++++++ outputs/2025-01-17T11-25-46.yaml | 49 ++++++++++++++++++++++++++++++++ outputs/2025-01-17T11-30-16.yaml | 43 ++++++++++++++++++++++++++++ run.py | 24 +++++++++++++--- 4 files changed, 159 insertions(+), 4 deletions(-) create mode 100644 outputs/2025-01-17T11-12-04.yaml create mode 100644 outputs/2025-01-17T11-25-46.yaml create mode 100644 outputs/2025-01-17T11-30-16.yaml diff --git a/outputs/2025-01-17T11-12-04.yaml b/outputs/2025-01-17T11-12-04.yaml new file mode 100644 index 0000000..dfb35d3 --- /dev/null +++ b/outputs/2025-01-17T11-12-04.yaml @@ -0,0 +1,47 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: You are an expert answering questions for a five year old. + datetime: '2025-01-17T11:12:04.698731' + git_hash: 57b44ea +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - '{"answer_satisfies_criteria":true}' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - '{"answer_satisfies_criteria":true}' + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - '{"answer_satisfies_criteria":false}' + llm: + '{"answer_satisfies_criteria":true}': + - evalution: Does the answer mention the rigorous mathematical definition of differential + privacy? + expected: true + actual: + - '{"answer_satisfies_criteria":false}' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - '{"answer_satisfies_criteria":false}' + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - '{"answer_satisfies_criteria":false}' + runtime: '0:00:00.551525' diff --git a/outputs/2025-01-17T11-25-46.yaml b/outputs/2025-01-17T11-25-46.yaml new file mode 100644 index 0000000..43930d5 --- /dev/null +++ b/outputs/2025-01-17T11-25-46.yaml @@ -0,0 +1,49 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: You are an expert answering questions for a five year old. 
+ datetime: '2025-01-17T11:25:46.187147' + git_hash: 57b44ea +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'Yes' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - 'Yes' + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + llm: + ? Differential privacy is like a magic cloak that helps keep your secrets safe + when sharing information. It makes sure that even if someone looks at the data, + they can't tell anything special about you! + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'No' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - 'No' + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + runtime: '0:00:01.013878' diff --git a/outputs/2025-01-17T11-30-16.yaml b/outputs/2025-01-17T11-30-16.yaml new file mode 100644 index 0000000..763a070 --- /dev/null +++ b/outputs/2025-01-17T11-30-16.yaml @@ -0,0 +1,43 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: You are an expert answering questions for a five year old. + datetime: '2025-01-17T11:30:16.015934' + git_hash: 57b44ea +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: true + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: true + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + llm: + ? Differential privacy is like a magic cloak that helps keep your secrets safe + when sharing information. It makes sure that even if someone looks at the data, + they can't tell anything special about you! + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: false + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: false + - evalution: Does the answer describe anonymizing data by stripping identifiers? 
+ expected: false + actual: false + runtime: '0:00:00.987728' diff --git a/run.py b/run.py index 5ee0857..90d09a9 100755 --- a/run.py +++ b/run.py @@ -6,7 +6,9 @@ import subprocess import argparse import logging -from openai import OpenAI +from openai import OpenAI, NOT_GIVEN +from pydantic import BaseModel +import json def get_config(): @@ -38,7 +40,11 @@ def load_yaml(file_name): return safe_load((Path(__file__).parent / file_name).open()) -def ask_one_question(question, model, temperature, system): +class Evaluation(BaseModel): + answer_satisfies_criteria: bool + + +def ask_one_question(question, model, temperature, system, is_evaluation=False): logging.info(f"Q: {question}") start_time = datetime.now() @@ -49,16 +55,24 @@ def ask_one_question(question, model, temperature, system): messages = [{"role": "user", "content": question}] if system: messages.append({"role": "system", "content": system}) - completions = client.chat.completions.create( + # Trying to use "client.chat.completions.create" produced an error: + # > You tried to pass a `BaseModel` class to `chat.completions.create()`; + # > You must use `beta.chat.completions.parse()` instead + completions = client.beta.chat.completions.parse( messages=messages, model=model, temperature=temperature, + response_format=Evaluation if is_evaluation else NOT_GIVEN, ) end_time = datetime.now() answers = [choice.message.content for choice in completions.choices] for answer in answers: logging.info(f"A: {answer}") + if is_evaluation: + answers = [ + json.loads(answer)["answer_satisfies_criteria"] for answer in answers + ][0] return answers, (end_time - start_time) @@ -81,7 +95,9 @@ def ask_evaluation(question, answer, evaluation): Considering the response above, answer the following question with "yes" or "no": {evaluation} """ - return ask_one_question(question_answer_evaluation, model, temperature, system) + return ask_one_question( + question_answer_evaluation, model, temperature, system, is_evaluation=True + ) def evaluate_one_answer(question, answer, evaluations_in): From 5226b7520900c875849dd05959bf9db8472f7d18 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 17 Jan 2025 12:14:39 -0500 Subject: [PATCH 15/21] add scores! --- outputs/2025-01-17T12-13-41.yaml | 46 ++++++++++++++++++++++++++++++++ run.py | 17 ++++++++++-- 2 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 outputs/2025-01-17T12-13-41.yaml diff --git a/outputs/2025-01-17T12-13-41.yaml b/outputs/2025-01-17T12-13-41.yaml new file mode 100644 index 0000000..4fb25ee --- /dev/null +++ b/outputs/2025-01-17T12-13-41.yaml @@ -0,0 +1,46 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: You are an expert answering questions for a five year old. + datetime: '2025-01-17T12:13:41.387801' + git_hash: eae75e8 +scores: + human: 3 / 3 + llm: 1 / 3 +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? 
+ expected: true + actual: true + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: true + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + llm: + ? Differential privacy is like a magic cloak that helps keep your secrets safe + when sharing information. It makes sure that even if someone looks at the data, + they can't tell anything special about you! + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: false + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: false + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + runtime: '0:00:01.046167' diff --git a/run.py b/run.py index 90d09a9..578bcc9 100755 --- a/run.py +++ b/run.py @@ -92,7 +92,7 @@ def ask_evaluation(question, answer, evaluation): {answer} ''' -Considering the response above, answer the following question with "yes" or "no": +Considering the response above, answer the following question with "True" or "False": {evaluation} """ return ask_one_question( @@ -151,6 +151,18 @@ def ask_all_questions(config): return q_and_a_out +def get_scores(q_and_a): + scores = {} + for human_llm in q_and_a: + for agent in ["human", "llm"]: + evaluations = human_llm[agent].values() + flat_list = [e for e_list in evaluations for e in e_list] + total = len(flat_list) + correct = sum(1 for e in flat_list if e["expected"] == e["actual"]) + scores[agent] = f"{correct} / {total}" + return scores + + def save_results(datetime_now, results): yaml_out = dump( results, @@ -173,5 +185,6 @@ def save_results(datetime_now, results): "git_hash": get_git_hash(), } q_and_a = ask_all_questions(config) - results = {"metadata": metadata, "q_and_a": q_and_a} + scores = get_scores(q_and_a) + results = {"metadata": metadata, "scores": scores, "q_and_a": q_and_a} save_results(datetime_now, results) From ee8dcba2451d07c190fa5c6e11786541d4d014f6 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 17 Jan 2025 13:23:28 -0500 Subject: [PATCH 16/21] add ci.sh --- .flake8 | 12 ++++++++++++ .pytest.ini | 10 ++++++++++ ci.sh | 5 +++++ requirements.txt | 2 ++ tests/test_misc.py | 14 ++++++++++++++ 5 files changed, 43 insertions(+) create mode 100644 .flake8 create mode 100644 .pytest.ini create mode 100755 ci.sh create mode 100644 tests/test_misc.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..88d4d31 --- /dev/null +++ b/.flake8 @@ -0,0 +1,12 @@ +[flake8] +exclude = .git,.venv,__pycache__ + +# Config recommended by black: +# https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#bugbear +max-line-length = 80 +extend-select = B950 +extend-ignore = E203,E501,E701 + +; per-file-ignores = +; # Ignore undefined names in templates. 
+; */code_generators/no-tests/*.py:F821,F401,E302 diff --git a/.pytest.ini b/.pytest.ini new file mode 100644 index 0000000..6236693 --- /dev/null +++ b/.pytest.ini @@ -0,0 +1,10 @@ +[pytest] + +# Treat warnings as errors: +filterwarnings = + error + +addopts = --doctest-glob '*.md' --doctest-modules + +# If an xfail starts passing unexpectedly, that should count as a failure: +xfail_strict=true diff --git a/ci.sh b/ci.sh new file mode 100755 index 0000000..6b85cbb --- /dev/null +++ b/ci.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -euo pipefail + +pytest -vv --failed-first diff --git a/requirements.txt b/requirements.txt index b91f6da..9f59b2f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,5 @@ flake8 flake8-bugbear black openai +pytest +pyright diff --git a/tests/test_misc.py b/tests/test_misc.py new file mode 100644 index 0000000..dee4ba9 --- /dev/null +++ b/tests/test_misc.py @@ -0,0 +1,14 @@ +import subprocess +import pytest + + +tests = { + "flake8 linting": "flake8 . --count --show-source --statistics", + "pyright type checking": "pyright", +} + + +@pytest.mark.parametrize("cmd", tests.values(), ids=tests.keys()) +def test_subprocess(cmd: str): + result = subprocess.run(cmd, shell=True) + assert result.returncode == 0, f'"{cmd}" failed' From 23d7f62a3427a8c9ab5f7d0f289ffa1e1e31ad9b Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 17 Jan 2025 13:25:26 -0500 Subject: [PATCH 17/21] add workflow: Expect failure --- .github/workflows/test.yml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..2c79661 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,31 @@ +name: Test + +on: + push: + branches: + - main + pull_request: + +jobs: + test: + runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: + # - '3.9' + - '3.12' + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dev dependencies + run: pip install -r requirements.txt + + - name: Test + run: ./ci.sh From 7c5cc9e007b21782e352393644bdf5f0ba3dd0ae Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 17 Jan 2025 13:38:09 -0500 Subject: [PATCH 18/21] add one ignore and it passes --- run.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/run.py b/run.py index 578bcc9..98cb468 100755 --- a/run.py +++ b/run.py @@ -59,7 +59,7 @@ def ask_one_question(question, model, temperature, system, is_evaluation=False): # > You tried to pass a `BaseModel` class to `chat.completions.create()`; # > You must use `beta.chat.completions.parse()` instead completions = client.beta.chat.completions.parse( - messages=messages, + messages=messages, # type: ignore model=model, temperature=temperature, response_format=Evaluation if is_evaluation else NOT_GIVEN, @@ -71,7 +71,9 @@ def ask_one_question(question, model, temperature, system, is_evaluation=False): logging.info(f"A: {answer}") if is_evaluation: answers = [ - json.loads(answer)["answer_satisfies_criteria"] for answer in answers + json.loads(answer)["answer_satisfies_criteria"] + for answer in answers + if answer ][0] return answers, (end_time - start_time) From 0e78c645fe6ecff39b23355cf825cb8cc60488ba Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 17 Jan 2025 14:51:54 -0500 Subject: [PATCH 19/21] 
move the script down into directory; fix score reporting --- README.md | 6 +- llm_for_dp_qa/__init__.py | 0 .../credentials-template.yaml | 2 +- .../outputs}/2025-01-15T15-01-22.yaml | 0 .../outputs}/2025-01-15T15-18-47.yaml | 0 .../outputs}/2025-01-15T15-43-06.yaml | 0 .../outputs}/2025-01-16T15-57-55.yaml | 0 .../outputs}/2025-01-16T16-29-09.yaml | 0 .../outputs}/2025-01-16T16-29-59.yaml | 0 .../outputs}/2025-01-16T17-42-52.yaml | 0 .../outputs}/2025-01-16T18-14-39.yaml | 0 .../outputs}/2025-01-17T10-12-30.yaml | 0 .../outputs}/2025-01-17T10-55-36.yaml | 0 .../outputs}/2025-01-17T11-12-04.yaml | 0 .../outputs}/2025-01-17T11-25-46.yaml | 0 .../outputs}/2025-01-17T11-30-16.yaml | 0 .../outputs}/2025-01-17T12-13-41.yaml | 0 .../outputs/2025-01-17T13-58-38.yaml | 187 ++++++++++++++++++ .../outputs/2025-01-17T14-49-17.yaml | 179 +++++++++++++++++ llm_for_dp_qa/q-and-a.yaml | 29 +++ run.py => llm_for_dp_qa/run.py | 14 +- q-and-a.yaml | 11 -- tests/test_run.py | 39 ++++ 23 files changed, 446 insertions(+), 21 deletions(-) create mode 100644 llm_for_dp_qa/__init__.py rename credentials-template.yaml => llm_for_dp_qa/credentials-template.yaml (89%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-15T15-01-22.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-15T15-18-47.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-15T15-43-06.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-16T15-57-55.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-16T16-29-09.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-16T16-29-59.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-16T17-42-52.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-16T18-14-39.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-17T10-12-30.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-17T10-55-36.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-17T11-12-04.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-17T11-25-46.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-17T11-30-16.yaml (100%) rename {outputs => llm_for_dp_qa/outputs}/2025-01-17T12-13-41.yaml (100%) create mode 100644 llm_for_dp_qa/outputs/2025-01-17T13-58-38.yaml create mode 100644 llm_for_dp_qa/outputs/2025-01-17T14-49-17.yaml create mode 100644 llm_for_dp_qa/q-and-a.yaml rename run.py => llm_for_dp_qa/run.py (93%) delete mode 100644 q-and-a.yaml create mode 100644 tests/test_run.py diff --git a/README.md b/README.md index 44949e2..9ab5d15 100644 --- a/README.md +++ b/README.md @@ -7,13 +7,13 @@ python3 -m venv .venv source .venv/bin/activate pip install -r requirements.txt pre-commit install -cp credentials-template.yaml credentials.yaml +cp llm_for_dp_qa/credentials{-template,}.yaml ``` Fill in the git-ignored `credentials.yaml` with the [key and secret for this app](https://portal.apis.huit.harvard.edu/my-apps/6dce5383-bcb6-4c9f-bd14-8f59d356b221). 
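The filled-in file has just these two fields (the values here are placeholders):
```
key: your-key-from-the-portal
secret: your-secret-from-the-portal
```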
You should then be able to run the queries against the API: The output will also be written to `outputs/`: ``` -run.py -ls outputs +llm_for_dp_qa/run.py +ls llm_for_dp_qa/outputs ``` diff --git a/llm_for_dp_qa/__init__.py b/llm_for_dp_qa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/credentials-template.yaml b/llm_for_dp_qa/credentials-template.yaml similarity index 89% rename from credentials-template.yaml rename to llm_for_dp_qa/credentials-template.yaml index fd15e76..7ea2e57 100644 --- a/credentials-template.yaml +++ b/llm_for_dp_qa/credentials-template.yaml @@ -1,4 +1,4 @@ # Fill this in with the values at # https://portal.apis.huit.harvard.edu/my-apps/6dce5383-bcb6-4c9f-bd14-8f59d356b221 key: fill-in -secret: fill-in \ No newline at end of file +secret: fill-in diff --git a/outputs/2025-01-15T15-01-22.yaml b/llm_for_dp_qa/outputs/2025-01-15T15-01-22.yaml similarity index 100% rename from outputs/2025-01-15T15-01-22.yaml rename to llm_for_dp_qa/outputs/2025-01-15T15-01-22.yaml diff --git a/outputs/2025-01-15T15-18-47.yaml b/llm_for_dp_qa/outputs/2025-01-15T15-18-47.yaml similarity index 100% rename from outputs/2025-01-15T15-18-47.yaml rename to llm_for_dp_qa/outputs/2025-01-15T15-18-47.yaml diff --git a/outputs/2025-01-15T15-43-06.yaml b/llm_for_dp_qa/outputs/2025-01-15T15-43-06.yaml similarity index 100% rename from outputs/2025-01-15T15-43-06.yaml rename to llm_for_dp_qa/outputs/2025-01-15T15-43-06.yaml diff --git a/outputs/2025-01-16T15-57-55.yaml b/llm_for_dp_qa/outputs/2025-01-16T15-57-55.yaml similarity index 100% rename from outputs/2025-01-16T15-57-55.yaml rename to llm_for_dp_qa/outputs/2025-01-16T15-57-55.yaml diff --git a/outputs/2025-01-16T16-29-09.yaml b/llm_for_dp_qa/outputs/2025-01-16T16-29-09.yaml similarity index 100% rename from outputs/2025-01-16T16-29-09.yaml rename to llm_for_dp_qa/outputs/2025-01-16T16-29-09.yaml diff --git a/outputs/2025-01-16T16-29-59.yaml b/llm_for_dp_qa/outputs/2025-01-16T16-29-59.yaml similarity index 100% rename from outputs/2025-01-16T16-29-59.yaml rename to llm_for_dp_qa/outputs/2025-01-16T16-29-59.yaml diff --git a/outputs/2025-01-16T17-42-52.yaml b/llm_for_dp_qa/outputs/2025-01-16T17-42-52.yaml similarity index 100% rename from outputs/2025-01-16T17-42-52.yaml rename to llm_for_dp_qa/outputs/2025-01-16T17-42-52.yaml diff --git a/outputs/2025-01-16T18-14-39.yaml b/llm_for_dp_qa/outputs/2025-01-16T18-14-39.yaml similarity index 100% rename from outputs/2025-01-16T18-14-39.yaml rename to llm_for_dp_qa/outputs/2025-01-16T18-14-39.yaml diff --git a/outputs/2025-01-17T10-12-30.yaml b/llm_for_dp_qa/outputs/2025-01-17T10-12-30.yaml similarity index 100% rename from outputs/2025-01-17T10-12-30.yaml rename to llm_for_dp_qa/outputs/2025-01-17T10-12-30.yaml diff --git a/outputs/2025-01-17T10-55-36.yaml b/llm_for_dp_qa/outputs/2025-01-17T10-55-36.yaml similarity index 100% rename from outputs/2025-01-17T10-55-36.yaml rename to llm_for_dp_qa/outputs/2025-01-17T10-55-36.yaml diff --git a/outputs/2025-01-17T11-12-04.yaml b/llm_for_dp_qa/outputs/2025-01-17T11-12-04.yaml similarity index 100% rename from outputs/2025-01-17T11-12-04.yaml rename to llm_for_dp_qa/outputs/2025-01-17T11-12-04.yaml diff --git a/outputs/2025-01-17T11-25-46.yaml b/llm_for_dp_qa/outputs/2025-01-17T11-25-46.yaml similarity index 100% rename from outputs/2025-01-17T11-25-46.yaml rename to llm_for_dp_qa/outputs/2025-01-17T11-25-46.yaml diff --git a/outputs/2025-01-17T11-30-16.yaml b/llm_for_dp_qa/outputs/2025-01-17T11-30-16.yaml similarity index 100% rename 
from outputs/2025-01-17T11-30-16.yaml rename to llm_for_dp_qa/outputs/2025-01-17T11-30-16.yaml diff --git a/outputs/2025-01-17T12-13-41.yaml b/llm_for_dp_qa/outputs/2025-01-17T12-13-41.yaml similarity index 100% rename from outputs/2025-01-17T12-13-41.yaml rename to llm_for_dp_qa/outputs/2025-01-17T12-13-41.yaml diff --git a/llm_for_dp_qa/outputs/2025-01-17T13-58-38.yaml b/llm_for_dp_qa/outputs/2025-01-17T13-58-38.yaml new file mode 100644 index 0000000..577fa39 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-17T13-58-38.yaml @@ -0,0 +1,187 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: null + datetime: '2025-01-17T13:58:38.016689' + git_hash: 7c5cc9e +scores: + human: 3 / 3 + llm: 0 / 3 +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: true + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: true + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + llm: + ? Differential privacy is a mathematical framework that ensures the privacy of + individuals in a dataset by adding noise to the data or the query results, making + it difficult to identify any individual's information while still allowing for + useful aggregate insights. + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: false + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: true + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + runtime: '0:00:01.747258' +- question: 'I have an extremely useless question about the Bernoulli sampler used + in OpenDP. I have been working on implementing a the same idea for another language, + and was using the implementation in OpenDP as a reference, and I noticed something + about line 120: + + ``` + + i if i == leading_zeros => !prob.raw_exponent().is_zero() + + ``` + + This is the case where Geo(1/2)=leading_zeros, given that we are dealing with + valid probabilities in (0,1), this line will only ever return false for subnormal + floats, but for some subnormal floats, such as the floatmin(Float64), this changes + their probability from minuscule to 0. I''ve also seen that the largest subnormal + (2^-1022)(1-2^-52), has probability 1.11e-308 of returning true in the current + state versus 2.225e-308 (which is closer to its real value) if this line was just + "return true". Now I don''t think this change should be made, because it biases + subnormals slightly more towards true , which seems worse than undershooting, + but I was curious to know if there''s a better way to handle the subnormal case + in my version. + + ' + human: + ? "I'm not quite following why for FloatMin(float64) this implementation always\ + \ returns false. 
For such subnormals, there is some (small) probability that\ + \ first_heads_index lands in the part of the binary expansion represented by\ + \ the mantissa bits (line 124). In particular, line 120 is considering the case\ + \ that first_heads_index lands on the bit in the binary expansion that is represented\ + \ by the implicit bit, but I think I'm missing how this biases the probability\ + \ to always be 0 \n" + : - evalution: Does the answer reflect some confusion about the question? + expected: true + actual: true + - evalution: Does the answer mention that there is some (small) probability + that first_heads_index lands in the part of the binary expansion represented + by the mantissa bits? + expected: true + actual: true + - evalution: Does the answer point out that line 120 is considering the case + that first_heads_index lands on the bit in the binary expansion that is + represented by the implicit bit? + expected: true + actual: true + llm: + ? 'Your question about the handling of subnormal floats in the Bernoulli sampler + is quite insightful, and it touches on an important aspect of numerical stability + and precision in probabilistic algorithms. + + + In the context of the Bernoulli sampler, the line you referenced is indeed a + way to handle the edge case of very small probabilities, particularly subnormal + floats. The intention behind this line is to ensure that the sampler behaves + correctly for probabilities that are extremely close to zero, while also avoiding + potential pitfalls associated with floating-point precision. + + + ### Understanding the Issue + + + 1. **Subnormal Floats**: Subnormal (or denormal) floats are used to represent + numbers that are very close to zero but not exactly zero. They allow for gradual + underflow, which can be important in maintaining precision in calculations involving + very small numbers. + + + 2. **Biasing Towards True**: The line in question effectively introduces a condition + that can lead to a slight bias in favor of returning `true` for certain subnormal + values. This can be seen as a way to avoid the situation where a very small + probability is treated as zero, which would lead to a complete loss of information + about the event. + + + 3. **Precision vs. Bias**: As you noted, this approach can lead to a situation + where the probability of returning `true` for certain subnormal values is higher + than it would be if the line simply returned `true` for all cases. This can + introduce a bias, but it also helps to ensure that the sampler does not completely + ignore very small probabilities. + + + ### Possible Solutions + + + If you''re looking for a better way to handle subnormal cases in your implementation, + here are a few strategies you might consider: + + + 1. **Thresholding**: Introduce a small threshold value (e.g., `epsilon`) below + which you treat probabilities as zero. This way, you can avoid the complications + of subnormal floats while still maintaining a clear distinction between very + small probabilities and zero. + + + 2. **Custom Handling for Subnormals**: Instead of the current logic, you could + implement a specific case for subnormal floats. For example, you could check + if the probability is subnormal and handle it differently, perhaps by using + a different sampling strategy or adjusting the probability slightly to avoid + the pitfalls of floating-point precision. + + + 3. **Logarithmic Representation**: Consider using a logarithmic representation + for probabilities. 
This can help mitigate issues with very small values, as + you can work with log-probabilities instead. When sampling, you can convert + back to the original scale. + + + 4. **Monte Carlo Sampling**: If the probabilities are extremely small, you might + consider using a Monte Carlo approach where you sample multiple times and aggregate + results, which can help smooth out the effects of subnormal values. + + + 5. **Documentation and Testing**: Regardless of the approach you choose, ensure + that you document the behavior of your implementation clearly, especially regarding + how it handles edge cases like subnormal floats. Additionally, thorough testing + with a range of probability values, including subnormals, will help you understand + the implications of your design choices. + + + ### Conclusion + + + Ultimately, the choice of how to handle subnormal floats in your Bernoulli sampler + will depend on the specific requirements of your application and the trade-offs + you are willing to make between precision and bias. It''s a nuanced decision, + and careful consideration of the implications of each approach will help you + arrive at a solution that meets your needs.' + : - evalution: Does the answer reflect some confusion about the question? + expected: true + actual: false + - evalution: Does the answer mention that there is some (small) probability + that first_heads_index lands in the part of the binary expansion represented + by the mantissa bits? + expected: true + actual: false + - evalution: Does the answer point out that line 120 is considering the case + that first_heads_index lands on the bit in the binary expansion that is + represented by the implicit bit? + expected: true + actual: false + runtime: '0:00:12.800528' diff --git a/llm_for_dp_qa/outputs/2025-01-17T14-49-17.yaml b/llm_for_dp_qa/outputs/2025-01-17T14-49-17.yaml new file mode 100644 index 0000000..5493eb1 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-17T14-49-17.yaml @@ -0,0 +1,179 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: null + datetime: '2025-01-17T14:49:17.021489' + git_hash: 7c5cc9e +scores: + human: 6 / 6 + llm: 2 / 6 +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: true + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: true + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + llm: + ? Differential privacy is a mathematical framework that ensures the privacy of + individuals in a dataset by adding noise to the data or the query results, making + it difficult to identify any individual's information while still allowing for + useful aggregate insights. + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? 
+ expected: true + actual: false + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: true + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + runtime: '0:00:01.030129' +- question: 'I have an extremely useless question about the Bernoulli sampler used + in OpenDP. I have been working on implementing a the same idea for another language, + and was using the implementation in OpenDP as a reference, and I noticed something + about line 120: + + ``` + + i if i == leading_zeros => !prob.raw_exponent().is_zero() + + ``` + + This is the case where Geo(1/2)=leading_zeros, given that we are dealing with + valid probabilities in (0,1), this line will only ever return false for subnormal + floats, but for some subnormal floats, such as the floatmin(Float64), this changes + their probability from minuscule to 0. I''ve also seen that the largest subnormal + (2^-1022)(1-2^-52), has probability 1.11e-308 of returning true in the current + state versus 2.225e-308 (which is closer to its real value) if this line was just + "return true". Now I don''t think this change should be made, because it biases + subnormals slightly more towards true , which seems worse than undershooting, + but I was curious to know if there''s a better way to handle the subnormal case + in my version. + + ' + human: + ? "I'm not quite following why for FloatMin(float64) this implementation always\ + \ returns false. For such subnormals, there is some (small) probability that\ + \ first_heads_index lands in the part of the binary expansion represented by\ + \ the mantissa bits (line 124). In particular, line 120 is considering the case\ + \ that first_heads_index lands on the bit in the binary expansion that is represented\ + \ by the implicit bit, but I think I'm missing how this biases the probability\ + \ to always be 0 \n" + : - evalution: Does the answer reflect some confusion about the question? + expected: true + actual: true + - evalution: Does the answer mention that there is some (small) probability + that first_heads_index lands in the part of the binary expansion represented + by the mantissa bits? + expected: true + actual: true + - evalution: Does the answer point out that line 120 is considering the case + that first_heads_index lands on the bit in the binary expansion that is + represented by the implicit bit? + expected: true + actual: true + llm: + ? 'Your question about the handling of subnormal floats in the Bernoulli sampler + is quite insightful, and it touches on an important aspect of numerical stability + and precision in probabilistic algorithms. + + + In the context of the Bernoulli sampler, the line you referenced is indeed a + way to handle the edge case of very small probabilities, particularly subnormal + floats. The intention behind this line is to ensure that the sampler behaves + correctly even when dealing with probabilities that are extremely close to zero. + + + ### Understanding the Issue + + + 1. **Subnormal Floats**: Subnormal (or denormal) floats are numbers that are + too small to be represented in the normal floating-point format. They allow + for representation of values closer to zero than the smallest normal float, + but they come with reduced precision. This can lead to unexpected behavior in + probabilistic algorithms if not handled carefully. + + + 2. 
**Biasing Towards True**: The line in question effectively introduces a condition + that can lead to a bias in favor of returning `true` for certain subnormal values. + This can be problematic because it may skew the intended probabilities, especially + if the goal is to maintain a uniform sampling distribution. + + + ### Possible Solutions + + + Here are a few strategies you might consider for handling subnormal floats in + your implementation: + + + 1. **Thresholding**: Instead of using the raw exponent to determine the behavior + for subnormal floats, you could introduce a threshold. For example, if the probability + is below a certain small value (e.g., `Float64::min_positive()`), you could + treat it as zero. This way, you avoid the bias introduced by the current implementation. + + + 2. **Direct Comparison**: You could directly compare the probability against + a small epsilon value. If the probability is less than this epsilon, return + `false`. This would ensure that very small probabilities are treated as zero + without introducing bias. + + + 3. **Logarithmic Approach**: Consider using logarithmic probabilities. Instead + of working directly with probabilities, you could work with their logarithms. + This can help avoid issues with underflow and provide a more stable way to handle + very small values. + + + 4. **Custom Handling for Subnormals**: If you want to maintain the behavior + of the original implementation but avoid the bias, you could implement a custom + handling mechanism specifically for subnormal floats. For instance, you could + check if the probability is subnormal and apply a different logic that maintains + the intended distribution. + + + 5. **Testing and Validation**: Regardless of the approach you choose, it''s + crucial to validate the behavior of your implementation through extensive testing. + Ensure that the sampling distribution remains consistent with the expected probabilities, + especially in edge cases involving subnormal floats. + + + ### Conclusion + + + The handling of subnormal floats in probabilistic algorithms is a nuanced issue + that requires careful consideration of both numerical stability and the intended + behavior of the algorithm. By implementing one of the strategies above, you + can mitigate the potential biases introduced by the original line of code while + still maintaining the integrity of the sampling process.' + : - evalution: Does the answer reflect some confusion about the question? + expected: true + actual: false + - evalution: Does the answer mention that there is some (small) probability + that first_heads_index lands in the part of the binary expansion represented + by the mantissa bits? + expected: true + actual: false + - evalution: Does the answer point out that line 120 is considering the case + that first_heads_index lands on the bit in the binary expansion that is + represented by the implicit bit? + expected: true + actual: false + runtime: '0:00:09.587726' diff --git a/llm_for_dp_qa/q-and-a.yaml b/llm_for_dp_qa/q-and-a.yaml new file mode 100644 index 0000000..312d45b --- /dev/null +++ b/llm_for_dp_qa/q-and-a.yaml @@ -0,0 +1,29 @@ +- + Q: In a few words, what is differential privacy? + A: + - | + Differential privacy is a rigorous mathematical definition of privacy. Consider an algorithm that analyzes a dataset and releases statistics: The algorithm is differentially private if by looking at the output, you cannot tell whether any individual's data was included in the original dataset or not. 
Differential privacy achieves this by carefully injecting random noise into the released statistics to hide the effects of each individual. + evaluations: + true: + - Does the answer mention the rigorous mathematical definition of differential privacy? + - Does the answer describe adding calibrated noise to statistics? + false: + - Does the answer describe anonymizing data by stripping identifiers? +- + # https://opendp.slack.com/archives/C02L4CMC1LZ/p1723183400008439 + # Tweaked version of question from Apigail Gentle: + Q: | + I have an extremely useless question about the Bernoulli sampler used in OpenDP. I have been working on implementing a the same idea for another language, and was using the implementation in OpenDP as a reference, and I noticed something about line 120: + ``` + i if i == leading_zeros => !prob.raw_exponent().is_zero() + ``` + This is the case where Geo(1/2)=leading_zeros, given that we are dealing with valid probabilities in (0,1), this line will only ever return false for subnormal floats, but for some subnormal floats, such as the floatmin(Float64), this changes their probability from minuscule to 0. I've also seen that the largest subnormal (2^-1022)(1-2^-52), has probability 1.11e-308 of returning true in the current state versus 2.225e-308 (which is closer to its real value) if this line was just "return true". Now I don't think this change should be made, because it biases subnormals slightly more towards true , which seems worse than undershooting, but I was curious to know if there's a better way to handle the subnormal case in my version. + # From Zachary Ratliff: + A: + - | + I'm not quite following why for FloatMin(float64) this implementation always returns false. For such subnormals, there is some (small) probability that first_heads_index lands in the part of the binary expansion represented by the mantissa bits (line 124). In particular, line 120 is considering the case that first_heads_index lands on the bit in the binary expansion that is represented by the implicit bit, but I think I'm missing how this biases the probability to always be 0 + evaluations: + true: + - Does the answer reflect some confusion about the question? + - Does the answer mention that there is some (small) probability that first_heads_index lands in the part of the binary expansion represented by the mantissa bits? + - Does the answer point out that line 120 is considering the case that first_heads_index lands on the bit in the binary expansion that is represented by the implicit bit? 
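The exchange above turns on how the bit-by-bit Bernoulli sampler works: flip a fair coin until the first heads, and return that position's bit of the probability's binary expansion, so P(true) = sum_i bit_i(p) * 2^-i = p exactly. Line 120 is the flip that lands on the IEEE float's implicit leading bit, which `!prob.raw_exponent().is_zero()` makes 1 for normal floats and 0 for subnormals. Below is a minimal sketch of the same idea, assuming exact rational arithmetic instead of OpenDP's direct reads of the exponent and mantissa bits:

```
import math
import random
from fractions import Fraction


def sample_bernoulli(p: Fraction) -> bool:
    """Exact Bernoulli(p) for 0 < p < 1, using only fair coin flips.

    If the first heads arrives on flip i, return the i-th bit of the
    binary expansion of p; the chance of True is then exactly
    sum_i bit_i(p) * 2**-i == p, with no rounding error.
    """
    assert 0 < p < 1
    i = 0
    while True:
        i += 1
        if random.getrandbits(1):  # first heads on flip i
            # i-th bit of the binary fraction of p:
            return bool(math.floor(p * 2**i) % 2)


# Fraction(x) is exact for any float x, so even the smallest subnormal
# keeps its minuscule-but-nonzero chance of True, rather than rounding to 0:
flag = sample_bernoulli(Fraction(5e-324))
```

Reading bits straight off the exponent and mantissa, as the quoted line does, avoids the big integers this sketch creates; the trade-off the thread debates is only about which bit value to report at the implicit-bit position for subnormals.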
diff --git a/run.py b/llm_for_dp_qa/run.py similarity index 93% rename from run.py rename to llm_for_dp_qa/run.py index 98cb468..d826cea 100755 --- a/run.py +++ b/llm_for_dp_qa/run.py @@ -9,6 +9,7 @@ from openai import OpenAI, NOT_GIVEN from pydantic import BaseModel import json +from collections import defaultdict def get_config(): @@ -105,7 +106,7 @@ def ask_evaluation(question, answer, evaluation): def evaluate_one_answer(question, answer, evaluations_in): evaluations_out = [] for expected in [True, False]: - for evaluation in evaluations_in[expected]: + for evaluation in evaluations_in.get(expected, []): actual, _runtime = ask_evaluation(question, answer, evaluation) evaluations_out.append( { @@ -154,15 +155,16 @@ def ask_all_questions(config): def get_scores(q_and_a): - scores = {} + correct = defaultdict(int) + total = defaultdict(int) for human_llm in q_and_a: for agent in ["human", "llm"]: evaluations = human_llm[agent].values() flat_list = [e for e_list in evaluations for e in e_list] - total = len(flat_list) - correct = sum(1 for e in flat_list if e["expected"] == e["actual"]) - scores[agent] = f"{correct} / {total}" - return scores + total[agent] += len(flat_list) + correct[agent] += sum(1 for e in flat_list if e["expected"] == e["actual"]) + + return {agent: f"{correct[agent]} / {total[agent]}" for agent in ["human", "llm"]} def save_results(datetime_now, results): diff --git a/q-and-a.yaml b/q-and-a.yaml deleted file mode 100644 index 33e43b2..0000000 --- a/q-and-a.yaml +++ /dev/null @@ -1,11 +0,0 @@ -- - Q: In a few words, what is differential privacy? - A: - - | - Differential privacy is a rigorous mathematical definition of privacy. Consider an algorithm that analyzes a dataset and releases statistics: The algorithm is differentially private if by looking at the output, you cannot tell whether any individual's data was included in the original dataset or not. Differential privacy achieves this by carefully injecting random noise into the released statistics to hide the effects of each individual. - evaluations: - true: - - Does the answer mention the rigorous mathematical definition of differential privacy? - - Does the answer describe adding calibrated noise to statistics? - false: - - Does the answer describe anonymizing data by stripping identifiers? diff --git a/tests/test_run.py b/tests/test_run.py new file mode 100644 index 0000000..ed5a220 --- /dev/null +++ b/tests/test_run.py @@ -0,0 +1,39 @@ +from llm_for_dp_qa.run import get_scores + + +def test_get_scores(): + q_and_a = [ + { + "human": { + "q1": [ + {"expected": True, "actual": True}, + {"expected": True, "actual": True}, + ] + }, + "llm": { + "q1": [ + {"expected": True, "actual": False}, + {"expected": True, "actual": False}, + ] + }, + }, + { + "human": { + "q1": [ + {"expected": False, "actual": False}, + ], + "q2": [ + {"expected": False, "actual": False}, + ], + }, + "llm": { + "q1": [ + {"expected": False, "actual": True}, + ], + "q2": [ + {"expected": False, "actual": False}, # Let it get one right. 
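+                # Tallies across both questions: human 4 / 4, llm 1 / 4.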
+ ], + }, + }, + ] + assert get_scores(q_and_a) == {"human": "4 / 4", "llm": "1 / 4"} From a80572b69181fb613cd96855ac21b418d5f40874 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 17 Jan 2025 14:59:58 -0500 Subject: [PATCH 20/21] clean up q+a example --- llm_for_dp_qa/q-and-a.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llm_for_dp_qa/q-and-a.yaml b/llm_for_dp_qa/q-and-a.yaml index 312d45b..1a9e164 100644 --- a/llm_for_dp_qa/q-and-a.yaml +++ b/llm_for_dp_qa/q-and-a.yaml @@ -10,16 +10,17 @@ false: - Does the answer describe anonymizing data by stripping identifiers? - + # OpenDP slack, #random channel, August 9, 2024 # https://opendp.slack.com/archives/C02L4CMC1LZ/p1723183400008439 - # Tweaked version of question from Apigail Gentle: + # Tweaked question from Apigail Gentle: Q: | - I have an extremely useless question about the Bernoulli sampler used in OpenDP. I have been working on implementing a the same idea for another language, and was using the implementation in OpenDP as a reference, and I noticed something about line 120: + I have a question about the Bernoulli sampler used in OpenDP. I have been working on implementing a the same idea for another language, and was using the implementation in OpenDP as a reference, and I noticed something about line 120: ``` i if i == leading_zeros => !prob.raw_exponent().is_zero() ``` This is the case where Geo(1/2)=leading_zeros, given that we are dealing with valid probabilities in (0,1), this line will only ever return false for subnormal floats, but for some subnormal floats, such as the floatmin(Float64), this changes their probability from minuscule to 0. I've also seen that the largest subnormal (2^-1022)(1-2^-52), has probability 1.11e-308 of returning true in the current state versus 2.225e-308 (which is closer to its real value) if this line was just "return true". Now I don't think this change should be made, because it biases subnormals slightly more towards true , which seems worse than undershooting, but I was curious to know if there's a better way to handle the subnormal case in my version. - # From Zachary Ratliff: A: + # Tweaked response from Zachary Ratliff: - | I'm not quite following why for FloatMin(float64) this implementation always returns false. For such subnormals, there is some (small) probability that first_heads_index lands in the part of the binary expansion represented by the mantissa bits (line 124). In particular, line 120 is considering the case that first_heads_index lands on the bit in the binary expansion that is represented by the implicit bit, but I think I'm missing how this biases the probability to always be 0 evaluations: From 3e4e3c235e2e19121f3d16f7b48c250891bb5447 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Tue, 21 Jan 2025 12:38:32 -0500 Subject: [PATCH 21/21] Update .flake8 --- .flake8 | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.flake8 b/.flake8 index 88d4d31..1518df6 100644 --- a/.flake8 +++ b/.flake8 @@ -6,7 +6,3 @@ exclude = .git,.venv,__pycache__ max-line-length = 80 extend-select = B950 extend-ignore = E203,E501,E701 - -; per-file-ignores = -; # Ignore undefined names in templates. -; */code_generators/no-tests/*.py:F821,F401,E302