diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..1518df6 --- /dev/null +++ b/.flake8 @@ -0,0 +1,8 @@ +[flake8] +exclude = .git,.venv,__pycache__ + +# Config recommended by black: +# https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#bugbear +max-line-length = 80 +extend-select = B950 +extend-ignore = E203,E501,E701 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..2c79661 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,31 @@ +name: Test + +on: + push: + branches: + - main + pull_request: + +jobs: + test: + runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: + # - '3.9' + - '3.12' + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dev dependencies + run: pip install -r requirements.txt + + - name: Test + run: ./ci.sh diff --git a/.gitignore b/.gitignore index 82f9275..77f110a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +credentials.yaml + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..d07faf4 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + # Using this mirror lets us use mypyc-compiled black, which is about 2x faster + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 24.8.0 + hooks: + - id: black + # It is recommended to specify the latest version of Python + # supported by your project here, or alternatively use + # pre-commit's default_language_version, see + # https://pre-commit.com/#top_level-default_language_version + language_version: python3.11 diff --git a/.pytest.ini b/.pytest.ini new file mode 100644 index 0000000..6236693 --- /dev/null +++ b/.pytest.ini @@ -0,0 +1,10 @@ +[pytest] + +# Treat warnings as errors: +filterwarnings = + error + +addopts = --doctest-glob '*.md' --doctest-modules + +# If an xfail starts passing unexpectedly, that should count as a failure: +xfail_strict=true diff --git a/README.md b/README.md index 78109c8..9ab5d15 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,19 @@ # llm-for-dp-qa Experiments with LLMs for Q+A about DP + +To get started: +``` +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +pre-commit install +cp llm_for_dp_qa/credentials{-template,}.yaml +``` + +Fill in the git-ignored `credentials.yaml` with the [key and secret for this app](https://portal.apis.huit.harvard.edu/my-apps/6dce5383-bcb6-4c9f-bd14-8f59d356b221). 
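+ +The filled-in file keeps the same two keys as `credentials-template.yaml` (the values below are placeholders, not working credentials): +``` +key: your-api-key +secret: your-api-secret +```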
+ +You should then be able to run the queries against the API. The output is logged to the console and also written to `outputs/`: +``` +llm_for_dp_qa/run.py +ls llm_for_dp_qa/outputs +``` diff --git a/ci.sh b/ci.sh new file mode 100755 index 0000000..6b85cbb --- /dev/null +++ b/ci.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -euo pipefail + +pytest -vv --failed-first diff --git a/llm_for_dp_qa/__init__.py b/llm_for_dp_qa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/llm_for_dp_qa/credentials-template.yaml b/llm_for_dp_qa/credentials-template.yaml new file mode 100644 index 0000000..7ea2e57 --- /dev/null +++ b/llm_for_dp_qa/credentials-template.yaml @@ -0,0 +1,4 @@ +# Fill this in with the values at +# https://portal.apis.huit.harvard.edu/my-apps/6dce5383-bcb6-4c9f-bd14-8f59d356b221 +key: fill-in +secret: fill-in diff --git a/llm_for_dp_qa/outputs/2025-01-15T15-01-22.yaml b/llm_for_dp_qa/outputs/2025-01-15T15-01-22.yaml new file mode 100644 index 0000000..1d62005 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-15T15-01-22.yaml @@ -0,0 +1,29 @@ +- a: + - "Differential privacy is a mathematical framework designed to provide privacy\ + \ guarantees when analyzing and sharing data. It aims to protect individual privacy\ + \ while allowing for useful insights to be drawn from datasets. The key idea is\ + \ to ensure that the output of a data analysis algorithm does not significantly\ + \ depend on any single individual's data, making it difficult to infer whether\ + \ any specific individual's information was included in the dataset.\n\nHere are\ + \ the main concepts behind differential privacy:\n\n1. **Noise Addition**: To\ + \ achieve differential privacy, random noise is added to the results of queries\ + \ on the dataset. This noise is calibrated based on the sensitivity of the query,\ + \ which measures how much the output could change if a single individual's data\ + \ were included or excluded.\n\n2. **Privacy Parameter (\u03B5)**: Differential\ + \ privacy is often quantified using a parameter \u03B5 (epsilon), which controls\ + \ the trade-off between privacy and accuracy. A smaller \u03B5 provides stronger\ + \ privacy guarantees but can result in less accurate results, while a larger \u03B5\ + \ allows for more accurate results at the expense of privacy.\n\n3. **Indistinguishability**:\ + \ The core principle of differential privacy is that an observer should not be\ + \ able to determine whether a particular individual's data was included in the\ + \ dataset based on the output of the analysis. This is achieved by ensuring that\ + \ the outputs of the algorithm are statistically indistinguishable whether or\ + \ not the individual's data is included.\n\n4. **Robustness**: Differential privacy\ + \ provides a robust framework that can be applied to various types of data analysis,\ + \ including statistical queries, machine learning, and data sharing.\n\nDifferential\ + \ privacy has gained significant attention and adoption in various fields, including\ + \ healthcare, social sciences, and technology companies, particularly for protecting\ + \ user data in applications like Apple and Google.\n\nOverall, differential privacy\ + \ strikes a balance between the utility of data analysis and the need to protect\ + \ individual privacy, making it a powerful tool in the era of big data." + q: What is differential privacy?
diff --git a/llm_for_dp_qa/outputs/2025-01-15T15-18-47.yaml b/llm_for_dp_qa/outputs/2025-01-15T15-18-47.yaml new file mode 100644 index 0000000..15218b8 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-15T15-18-47.yaml @@ -0,0 +1,62 @@ +- q: What is differential privacy? + a: + - 'Differential privacy is a mathematical framework designed to provide a formal + definition of privacy protection when analyzing and sharing data. The core idea + is to ensure that the inclusion or exclusion of a single individual''s data in + a dataset does not significantly affect the outcome of any analysis performed + on that dataset. This means that the results of queries or analyses conducted + on the data should be similar, whether or not any particular individual''s data + is included. + + + ### Key Concepts: + + + 1. **Privacy Budget (ε)**: Differential privacy uses a parameter known as epsilon + (ε), which quantifies the level of privacy protection. A smaller ε indicates stronger + privacy guarantees, while a larger ε allows for more accurate data analysis but + weaker privacy. + + + 2. **Noise Addition**: To achieve differential privacy, noise is typically added + to the results of queries on the data. This noise is often drawn from a mathematical + distribution (like Laplace or Gaussian) and is calibrated based on the sensitivity + of the query—how much the output can change with the addition or removal of a + single individual''s data. + + + 3. **Sensitivity**: The sensitivity of a function is a measure of how much the + function''s output can change when a single individual''s data is added or removed. + Functions with lower sensitivity are generally easier to protect with differential + privacy. + + + 4. **Indistinguishability**: The fundamental goal of differential privacy is to + make it difficult for an observer to determine whether any particular individual''s + data was included in the analysis, thus protecting individual privacy. + + + ### Applications: + + + Differential privacy has been adopted in various fields, including: + + + - **Statistics and Data Analysis**: To release aggregate statistics without compromising + individual privacy. + + - **Machine Learning**: To train models on sensitive datasets while protecting + the privacy of the training data. + + - **Public Data Releases**: Governments and organizations use differential privacy + to release census data or other sensitive information while maintaining the confidentiality + of individuals. + + + ### Conclusion: + + + Differential privacy provides a robust framework for balancing the needs for data + utility and individual privacy. By carefully managing the trade-offs between accuracy + and privacy, it allows organizations to analyze and share data without exposing + sensitive information about individuals.' diff --git a/llm_for_dp_qa/outputs/2025-01-15T15-43-06.yaml b/llm_for_dp_qa/outputs/2025-01-15T15-43-06.yaml new file mode 100644 index 0000000..7b82680 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-15T15-43-06.yaml @@ -0,0 +1,13 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0.7 + git_hash: 1219e73 + datetime: '2025-01-15T15:43:06.734594' +q_and_a: +- q: In a few words, what is differential privacy? + a: + - Differential privacy is a technique used to ensure that the output of a database + query does not reveal too much information about any individual record in the + database, thereby protecting individual privacy while still allowing for useful + data analysis. 
diff --git a/llm_for_dp_qa/outputs/2025-01-16T15-57-55.yaml b/llm_for_dp_qa/outputs/2025-01-16T15-57-55.yaml new file mode 100644 index 0000000..7c9f978 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-16T15-57-55.yaml @@ -0,0 +1,22 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + git_hash: cdb186f + datetime: '2025-01-16T15:57:55.953321' +q_and_a: +- question: In a few words, what is differential privacy? + human answer: 'Differential privacy is a rigorous mathematical definition of privacy. + Consider an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released statistics + to hide the effects of each individual. + + ' + llm answers: + - Differential privacy is a mathematical framework that ensures the privacy of individuals + in a dataset by adding noise to the data or query results, making it difficult + to identify any individual's information while still allowing for useful aggregate + insights. + runtime: '0:00:03.099645' diff --git a/llm_for_dp_qa/outputs/2025-01-16T16-29-09.yaml b/llm_for_dp_qa/outputs/2025-01-16T16-29-09.yaml new file mode 100644 index 0000000..3306712 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-16T16-29-09.yaml @@ -0,0 +1,22 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + datetime: '2025-01-16T16:29:09.845071' + git_hash: 0eb6250 +q_and_a: +- question: In a few words, what is differential privacy? + human_answer: 'Differential privacy is a rigorous mathematical definition of privacy. + Consider an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released statistics + to hide the effects of each individual. + + ' + llm_answers: + - Differential privacy is a mathematical framework that ensures the privacy of individuals + in a dataset by adding noise to the data or the results of queries, making it + difficult to identify any individual's information while still allowing for useful + aggregate insights. + runtime: '0:00:02.060478' diff --git a/llm_for_dp_qa/outputs/2025-01-16T16-29-59.yaml b/llm_for_dp_qa/outputs/2025-01-16T16-29-59.yaml new file mode 100644 index 0000000..92cca0f --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-16T16-29-59.yaml @@ -0,0 +1,23 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0.99 + datetime: '2025-01-16T16:29:59.318412' + git_hash: 0eb6250 +q_and_a: +- question: In a few words, what is differential privacy? + human_answer: 'Differential privacy is a rigorous mathematical definition of privacy. + Consider an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released statistics + to hide the effects of each individual. + + ' + llm_answers: + - Differential privacy is a framework designed to provide strong privacy guarantees + when analyzing and sharing statistical data. 
It ensures that the inclusion or + exclusion of an individual's data does not significantly affect the output of + a function, thereby protecting individual privacy while still allowing for useful + insights from aggregated data. + runtime: '0:00:01.193081' diff --git a/llm_for_dp_qa/outputs/2025-01-16T17-42-52.yaml b/llm_for_dp_qa/outputs/2025-01-16T17-42-52.yaml new file mode 100644 index 0000000..ced4c2d --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-16T17-42-52.yaml @@ -0,0 +1,37 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + datetime: '2025-01-16T17:42:52.450949' + git_hash: 550587f +q_and_a: +- question: In a few words, what is differential privacy? + human_answers: + - evalution: Does the answer mention the rigorous mathematical definition of differential + privacy? + expected: true + actual: + - 'Yes' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + llm_answers: + - evalution: Does the answer mention the rigorous mathematical definition of differential + privacy? + expected: true + actual: + - 'No' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + runtime: '0:00:01.043781' diff --git a/llm_for_dp_qa/outputs/2025-01-16T18-14-39.yaml b/llm_for_dp_qa/outputs/2025-01-16T18-14-39.yaml new file mode 100644 index 0000000..3c270a4 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-16T18-14-39.yaml @@ -0,0 +1,49 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + datetime: '2025-01-16T18:14:39.582095' + git_hash: a25fc61 +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'Yes' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + llm: + ? Differential privacy is a mathematical framework that ensures the privacy of + individuals in a dataset by adding noise to the data or the query results, making + it difficult to identify any individual's information while still allowing for + useful aggregate insights. + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'No' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? 
+ expected: false + actual: + - 'No' + runtime: '0:00:01.491051' diff --git a/llm_for_dp_qa/outputs/2025-01-17T10-12-30.yaml b/llm_for_dp_qa/outputs/2025-01-17T10-12-30.yaml new file mode 100644 index 0000000..3acab07 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-17T10-12-30.yaml @@ -0,0 +1,50 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + pre: '' + datetime: '2025-01-17T10:12:30.943123' + git_hash: a0793a2 +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - Yes. + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + llm: + ? Differential privacy is a mathematical framework that ensures the privacy of + individuals in a dataset by adding noise to the data or the results of queries, + making it difficult to identify any individual's information while still allowing + for useful aggregate insights. + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'No' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + runtime: '0:00:01.027454' diff --git a/llm_for_dp_qa/outputs/2025-01-17T10-55-36.yaml b/llm_for_dp_qa/outputs/2025-01-17T10-55-36.yaml new file mode 100644 index 0000000..4988bdc --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-17T10-55-36.yaml @@ -0,0 +1,49 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: You are an expert answering questions for a five year old. + datetime: '2025-01-17T10:55:36.164764' + git_hash: 3ef7bdd +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'Yes' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - Yes. + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + llm: + ? Differential privacy is like a magic cloak that helps keep your secrets safe + when sharing information. 
It makes sure that even if someone looks at the data, + they can't tell anything special about you! + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'No' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - 'No' + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + runtime: '0:00:01.270520' diff --git a/llm_for_dp_qa/outputs/2025-01-17T11-12-04.yaml b/llm_for_dp_qa/outputs/2025-01-17T11-12-04.yaml new file mode 100644 index 0000000..dfb35d3 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-17T11-12-04.yaml @@ -0,0 +1,47 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: You are an expert answering questions for a five year old. + datetime: '2025-01-17T11:12:04.698731' + git_hash: 57b44ea +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - '{"answer_satisfies_criteria":true}' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - '{"answer_satisfies_criteria":true}' + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - '{"answer_satisfies_criteria":false}' + llm: + '{"answer_satisfies_criteria":true}': + - evalution: Does the answer mention the rigorous mathematical definition of differential + privacy? + expected: true + actual: + - '{"answer_satisfies_criteria":false}' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - '{"answer_satisfies_criteria":false}' + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - '{"answer_satisfies_criteria":false}' + runtime: '0:00:00.551525' diff --git a/llm_for_dp_qa/outputs/2025-01-17T11-25-46.yaml b/llm_for_dp_qa/outputs/2025-01-17T11-25-46.yaml new file mode 100644 index 0000000..43930d5 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-17T11-25-46.yaml @@ -0,0 +1,49 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: You are an expert answering questions for a five year old. + datetime: '2025-01-17T11:25:46.187147' + git_hash: 57b44ea +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. 
+ + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'Yes' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - 'Yes' + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + llm: + ? Differential privacy is like a magic cloak that helps keep your secrets safe + when sharing information. It makes sure that even if someone looks at the data, + they can't tell anything special about you! + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: + - 'No' + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: + - 'No' + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: + - 'No' + runtime: '0:00:01.013878' diff --git a/llm_for_dp_qa/outputs/2025-01-17T11-30-16.yaml b/llm_for_dp_qa/outputs/2025-01-17T11-30-16.yaml new file mode 100644 index 0000000..763a070 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-17T11-30-16.yaml @@ -0,0 +1,43 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: You are an expert answering questions for a five year old. + datetime: '2025-01-17T11:30:16.015934' + git_hash: 57b44ea +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: true + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: true + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + llm: + ? Differential privacy is like a magic cloak that helps keep your secrets safe + when sharing information. It makes sure that even if someone looks at the data, + they can't tell anything special about you! + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: false + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: false + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + runtime: '0:00:00.987728' diff --git a/llm_for_dp_qa/outputs/2025-01-17T12-13-41.yaml b/llm_for_dp_qa/outputs/2025-01-17T12-13-41.yaml new file mode 100644 index 0000000..4fb25ee --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-17T12-13-41.yaml @@ -0,0 +1,46 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: You are an expert answering questions for a five year old. + datetime: '2025-01-17T12:13:41.387801' + git_hash: eae75e8 +scores: + human: 3 / 3 + llm: 1 / 3 +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 
'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: true + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: true + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + llm: + ? Differential privacy is like a magic cloak that helps keep your secrets safe + when sharing information. It makes sure that even if someone looks at the data, + they can't tell anything special about you! + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: false + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: false + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + runtime: '0:00:01.046167' diff --git a/llm_for_dp_qa/outputs/2025-01-17T13-58-38.yaml b/llm_for_dp_qa/outputs/2025-01-17T13-58-38.yaml new file mode 100644 index 0000000..577fa39 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-17T13-58-38.yaml @@ -0,0 +1,187 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: null + datetime: '2025-01-17T13:58:38.016689' + git_hash: 7c5cc9e +scores: + human: 3 / 3 + llm: 0 / 3 +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: true + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: true + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + llm: + ? Differential privacy is a mathematical framework that ensures the privacy of + individuals in a dataset by adding noise to the data or the query results, making + it difficult to identify any individual's information while still allowing for + useful aggregate insights. + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: false + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: true + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + runtime: '0:00:01.747258' +- question: 'I have an extremely useless question about the Bernoulli sampler used + in OpenDP. 
I have been working on implementing a the same idea for another language, + and was using the implementation in OpenDP as a reference, and I noticed something + about line 120: + + ``` + + i if i == leading_zeros => !prob.raw_exponent().is_zero() + + ``` + + This is the case where Geo(1/2)=leading_zeros, given that we are dealing with + valid probabilities in (0,1), this line will only ever return false for subnormal + floats, but for some subnormal floats, such as the floatmin(Float64), this changes + their probability from minuscule to 0. I''ve also seen that the largest subnormal + (2^-1022)(1-2^-52), has probability 1.11e-308 of returning true in the current + state versus 2.225e-308 (which is closer to its real value) if this line was just + "return true". Now I don''t think this change should be made, because it biases + subnormals slightly more towards true , which seems worse than undershooting, + but I was curious to know if there''s a better way to handle the subnormal case + in my version. + + ' + human: + ? "I'm not quite following why for FloatMin(float64) this implementation always\ + \ returns false. For such subnormals, there is some (small) probability that\ + \ first_heads_index lands in the part of the binary expansion represented by\ + \ the mantissa bits (line 124). In particular, line 120 is considering the case\ + \ that first_heads_index lands on the bit in the binary expansion that is represented\ + \ by the implicit bit, but I think I'm missing how this biases the probability\ + \ to always be 0 \n" + : - evalution: Does the answer reflect some confusion about the question? + expected: true + actual: true + - evalution: Does the answer mention that there is some (small) probability + that first_heads_index lands in the part of the binary expansion represented + by the mantissa bits? + expected: true + actual: true + - evalution: Does the answer point out that line 120 is considering the case + that first_heads_index lands on the bit in the binary expansion that is + represented by the implicit bit? + expected: true + actual: true + llm: + ? 'Your question about the handling of subnormal floats in the Bernoulli sampler + is quite insightful, and it touches on an important aspect of numerical stability + and precision in probabilistic algorithms. + + + In the context of the Bernoulli sampler, the line you referenced is indeed a + way to handle the edge case of very small probabilities, particularly subnormal + floats. The intention behind this line is to ensure that the sampler behaves + correctly for probabilities that are extremely close to zero, while also avoiding + potential pitfalls associated with floating-point precision. + + + ### Understanding the Issue + + + 1. **Subnormal Floats**: Subnormal (or denormal) floats are used to represent + numbers that are very close to zero but not exactly zero. They allow for gradual + underflow, which can be important in maintaining precision in calculations involving + very small numbers. + + + 2. **Biasing Towards True**: The line in question effectively introduces a condition + that can lead to a slight bias in favor of returning `true` for certain subnormal + values. This can be seen as a way to avoid the situation where a very small + probability is treated as zero, which would lead to a complete loss of information + about the event. + + + 3. **Precision vs. 
Bias**: As you noted, this approach can lead to a situation + where the probability of returning `true` for certain subnormal values is higher + than it would be if the line simply returned `true` for all cases. This can + introduce a bias, but it also helps to ensure that the sampler does not completely + ignore very small probabilities. + + + ### Possible Solutions + + + If you''re looking for a better way to handle subnormal cases in your implementation, + here are a few strategies you might consider: + + + 1. **Thresholding**: Introduce a small threshold value (e.g., `epsilon`) below + which you treat probabilities as zero. This way, you can avoid the complications + of subnormal floats while still maintaining a clear distinction between very + small probabilities and zero. + + + 2. **Custom Handling for Subnormals**: Instead of the current logic, you could + implement a specific case for subnormal floats. For example, you could check + if the probability is subnormal and handle it differently, perhaps by using + a different sampling strategy or adjusting the probability slightly to avoid + the pitfalls of floating-point precision. + + + 3. **Logarithmic Representation**: Consider using a logarithmic representation + for probabilities. This can help mitigate issues with very small values, as + you can work with log-probabilities instead. When sampling, you can convert + back to the original scale. + + + 4. **Monte Carlo Sampling**: If the probabilities are extremely small, you might + consider using a Monte Carlo approach where you sample multiple times and aggregate + results, which can help smooth out the effects of subnormal values. + + + 5. **Documentation and Testing**: Regardless of the approach you choose, ensure + that you document the behavior of your implementation clearly, especially regarding + how it handles edge cases like subnormal floats. Additionally, thorough testing + with a range of probability values, including subnormals, will help you understand + the implications of your design choices. + + + ### Conclusion + + + Ultimately, the choice of how to handle subnormal floats in your Bernoulli sampler + will depend on the specific requirements of your application and the trade-offs + you are willing to make between precision and bias. It''s a nuanced decision, + and careful consideration of the implications of each approach will help you + arrive at a solution that meets your needs.' + : - evalution: Does the answer reflect some confusion about the question? + expected: true + actual: false + - evalution: Does the answer mention that there is some (small) probability + that first_heads_index lands in the part of the binary expansion represented + by the mantissa bits? + expected: true + actual: false + - evalution: Does the answer point out that line 120 is considering the case + that first_heads_index lands on the bit in the binary expansion that is + represented by the implicit bit? + expected: true + actual: false + runtime: '0:00:12.800528' diff --git a/llm_for_dp_qa/outputs/2025-01-17T14-49-17.yaml b/llm_for_dp_qa/outputs/2025-01-17T14-49-17.yaml new file mode 100644 index 0000000..5493eb1 --- /dev/null +++ b/llm_for_dp_qa/outputs/2025-01-17T14-49-17.yaml @@ -0,0 +1,179 @@ +metadata: + config: + model: gpt-4o-mini + temperature: 0 + system: null + datetime: '2025-01-17T14:49:17.021489' + git_hash: 7c5cc9e +scores: + human: 6 / 6 + llm: 2 / 6 +q_and_a: +- question: In a few words, what is differential privacy? + human: + ? 
'Differential privacy is a rigorous mathematical definition of privacy. Consider + an algorithm that analyzes a dataset and releases statistics: The algorithm + is differentially private if by looking at the output, you cannot tell whether + any individual''s data was included in the original dataset or not. Differential + privacy achieves this by carefully injecting random noise into the released + statistics to hide the effects of each individual. + + ' + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: true + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: true + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + llm: + ? Differential privacy is a mathematical framework that ensures the privacy of + individuals in a dataset by adding noise to the data or the query results, making + it difficult to identify any individual's information while still allowing for + useful aggregate insights. + : - evalution: Does the answer mention the rigorous mathematical definition of + differential privacy? + expected: true + actual: false + - evalution: Does the answer describe adding calibrated noise to statistics? + expected: true + actual: true + - evalution: Does the answer describe anonymizing data by stripping identifiers? + expected: false + actual: false + runtime: '0:00:01.030129' +- question: 'I have an extremely useless question about the Bernoulli sampler used + in OpenDP. I have been working on implementing a the same idea for another language, + and was using the implementation in OpenDP as a reference, and I noticed something + about line 120: + + ``` + + i if i == leading_zeros => !prob.raw_exponent().is_zero() + + ``` + + This is the case where Geo(1/2)=leading_zeros, given that we are dealing with + valid probabilities in (0,1), this line will only ever return false for subnormal + floats, but for some subnormal floats, such as the floatmin(Float64), this changes + their probability from minuscule to 0. I''ve also seen that the largest subnormal + (2^-1022)(1-2^-52), has probability 1.11e-308 of returning true in the current + state versus 2.225e-308 (which is closer to its real value) if this line was just + "return true". Now I don''t think this change should be made, because it biases + subnormals slightly more towards true , which seems worse than undershooting, + but I was curious to know if there''s a better way to handle the subnormal case + in my version. + + ' + human: + ? "I'm not quite following why for FloatMin(float64) this implementation always\ + \ returns false. For such subnormals, there is some (small) probability that\ + \ first_heads_index lands in the part of the binary expansion represented by\ + \ the mantissa bits (line 124). In particular, line 120 is considering the case\ + \ that first_heads_index lands on the bit in the binary expansion that is represented\ + \ by the implicit bit, but I think I'm missing how this biases the probability\ + \ to always be 0 \n" + : - evalution: Does the answer reflect some confusion about the question? + expected: true + actual: true + - evalution: Does the answer mention that there is some (small) probability + that first_heads_index lands in the part of the binary expansion represented + by the mantissa bits? 
+ expected: true + actual: true + - evalution: Does the answer point out that line 120 is considering the case + that first_heads_index lands on the bit in the binary expansion that is + represented by the implicit bit? + expected: true + actual: true + llm: + ? 'Your question about the handling of subnormal floats in the Bernoulli sampler + is quite insightful, and it touches on an important aspect of numerical stability + and precision in probabilistic algorithms. + + + In the context of the Bernoulli sampler, the line you referenced is indeed a + way to handle the edge case of very small probabilities, particularly subnormal + floats. The intention behind this line is to ensure that the sampler behaves + correctly even when dealing with probabilities that are extremely close to zero. + + + ### Understanding the Issue + + + 1. **Subnormal Floats**: Subnormal (or denormal) floats are numbers that are + too small to be represented in the normal floating-point format. They allow + for representation of values closer to zero than the smallest normal float, + but they come with reduced precision. This can lead to unexpected behavior in + probabilistic algorithms if not handled carefully. + + + 2. **Biasing Towards True**: The line in question effectively introduces a condition + that can lead to a bias in favor of returning `true` for certain subnormal values. + This can be problematic because it may skew the intended probabilities, especially + if the goal is to maintain a uniform sampling distribution. + + + ### Possible Solutions + + + Here are a few strategies you might consider for handling subnormal floats in + your implementation: + + + 1. **Thresholding**: Instead of using the raw exponent to determine the behavior + for subnormal floats, you could introduce a threshold. For example, if the probability + is below a certain small value (e.g., `Float64::min_positive()`), you could + treat it as zero. This way, you avoid the bias introduced by the current implementation. + + + 2. **Direct Comparison**: You could directly compare the probability against + a small epsilon value. If the probability is less than this epsilon, return + `false`. This would ensure that very small probabilities are treated as zero + without introducing bias. + + + 3. **Logarithmic Approach**: Consider using logarithmic probabilities. Instead + of working directly with probabilities, you could work with their logarithms. + This can help avoid issues with underflow and provide a more stable way to handle + very small values. + + + 4. **Custom Handling for Subnormals**: If you want to maintain the behavior + of the original implementation but avoid the bias, you could implement a custom + handling mechanism specifically for subnormal floats. For instance, you could + check if the probability is subnormal and apply a different logic that maintains + the intended distribution. + + + 5. **Testing and Validation**: Regardless of the approach you choose, it''s + crucial to validate the behavior of your implementation through extensive testing. + Ensure that the sampling distribution remains consistent with the expected probabilities, + especially in edge cases involving subnormal floats. + + + ### Conclusion + + + The handling of subnormal floats in probabilistic algorithms is a nuanced issue + that requires careful consideration of both numerical stability and the intended + behavior of the algorithm. 
By implementing one of the strategies above, you + can mitigate the potential biases introduced by the original line of code while + still maintaining the integrity of the sampling process.' + : - evalution: Does the answer reflect some confusion about the question? + expected: true + actual: false + - evalution: Does the answer mention that there is some (small) probability + that first_heads_index lands in the part of the binary expansion represented + by the mantissa bits? + expected: true + actual: false + - evalution: Does the answer point out that line 120 is considering the case + that first_heads_index lands on the bit in the binary expansion that is + represented by the implicit bit? + expected: true + actual: false + runtime: '0:00:09.587726' diff --git a/llm_for_dp_qa/q-and-a.yaml b/llm_for_dp_qa/q-and-a.yaml new file mode 100644 index 0000000..1a9e164 --- /dev/null +++ b/llm_for_dp_qa/q-and-a.yaml @@ -0,0 +1,30 @@ +- + Q: In a few words, what is differential privacy? + A: + - | + Differential privacy is a rigorous mathematical definition of privacy. Consider an algorithm that analyzes a dataset and releases statistics: The algorithm is differentially private if by looking at the output, you cannot tell whether any individual's data was included in the original dataset or not. Differential privacy achieves this by carefully injecting random noise into the released statistics to hide the effects of each individual. + evaluations: + true: + - Does the answer mention the rigorous mathematical definition of differential privacy? + - Does the answer describe adding calibrated noise to statistics? + false: + - Does the answer describe anonymizing data by stripping identifiers? +- + # OpenDP slack, #random channel, August 9, 2024 + # https://opendp.slack.com/archives/C02L4CMC1LZ/p1723183400008439 + # Tweaked question from Apigail Gentle: + Q: | + I have a question about the Bernoulli sampler used in OpenDP. I have been working on implementing a the same idea for another language, and was using the implementation in OpenDP as a reference, and I noticed something about line 120: + ``` + i if i == leading_zeros => !prob.raw_exponent().is_zero() + ``` + This is the case where Geo(1/2)=leading_zeros, given that we are dealing with valid probabilities in (0,1), this line will only ever return false for subnormal floats, but for some subnormal floats, such as the floatmin(Float64), this changes their probability from minuscule to 0. I've also seen that the largest subnormal (2^-1022)(1-2^-52), has probability 1.11e-308 of returning true in the current state versus 2.225e-308 (which is closer to its real value) if this line was just "return true". Now I don't think this change should be made, because it biases subnormals slightly more towards true , which seems worse than undershooting, but I was curious to know if there's a better way to handle the subnormal case in my version. + A: + # Tweaked response from Zachary Ratliff: + - | + I'm not quite following why for FloatMin(float64) this implementation always returns false. For such subnormals, there is some (small) probability that first_heads_index lands in the part of the binary expansion represented by the mantissa bits (line 124). In particular, line 120 is considering the case that first_heads_index lands on the bit in the binary expansion that is represented by the implicit bit, but I think I'm missing how this biases the probability to always be 0 + evaluations: + true: + - Does the answer reflect some confusion about the question? 
+ - Does the answer mention that there is some (small) probability that first_heads_index lands in the part of the binary expansion represented by the mantissa bits? + - Does the answer point out that line 120 is considering the case that first_heads_index lands on the bit in the binary expansion that is represented by the implicit bit? diff --git a/llm_for_dp_qa/run.py b/llm_for_dp_qa/run.py new file mode 100755 index 0000000..d826cea --- /dev/null +++ b/llm_for_dp_qa/run.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +from yaml import safe_load, dump +from pathlib import Path +import re +from datetime import datetime +import subprocess +import argparse +import logging +from openai import OpenAI, NOT_GIVEN +from pydantic import BaseModel +import json +from collections import defaultdict + + +def get_config(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", default="gpt-4o-mini") + parser.add_argument("--temperature", type=float, default=0) + parser.add_argument("--system") + args = parser.parse_args() + return { + "model": args.model, + "temperature": args.temperature, + "system": args.system, + } + + +def get_git_hash(): + completed = subprocess.run( + "git rev-parse --short HEAD", shell=True, capture_output=True + ) + return completed.stdout.decode().strip() + + +def get_key(): + credentials = load_yaml("credentials.yaml") + return credentials["key"] + + +def load_yaml(file_name): + return safe_load((Path(__file__).parent / file_name).open()) + + +class Evaluation(BaseModel): + answer_satisfies_criteria: bool + + +def ask_one_question(question, model, temperature, system, is_evaluation=False): + logging.info(f"Q: {question}") + + start_time = datetime.now() + client = OpenAI( + base_url="https://go.apis.huit.harvard.edu/ais-openai-direct-limited-schools/v1", + api_key=get_key(), + ) + messages = [{"role": "user", "content": question}] + if system: + messages.append({"role": "system", "content": system}) + # Trying to use "client.chat.completions.create" produced an error: + # > You tried to pass a `BaseModel` class to `chat.completions.create()`; + # > You must use `beta.chat.completions.parse()` instead + completions = client.beta.chat.completions.parse( + messages=messages, # type: ignore + model=model, + temperature=temperature, + response_format=Evaluation if is_evaluation else NOT_GIVEN, + ) + end_time = datetime.now() + + answers = [choice.message.content for choice in completions.choices] + for answer in answers: + logging.info(f"A: {answer}") + if is_evaluation: + answers = [ + json.loads(answer)["answer_satisfies_criteria"] + for answer in answers + if answer + ][0] + return answers, (end_time - start_time) + + +def ask_evaluation(question, answer, evaluation): + # For the evaluation, we want boring, reliable answers, + # even as we change the parameters for the primary query. + # Might surface these as a separate config at some point. 
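+ # Note: these hardcoded settings apply only to the evaluation calls; the primary question still uses the --model/--temperature/--system values from get_config().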
+ model = "gpt-4o-mini" + temperature = 0 + system = None + question_answer_evaluation = f"""First, read the following question and answer pair: + +Question: {question} + +Answer: +''' +{answer} +''' + +Considering the response above, answer the following question with "True" or "False": +{evaluation} +""" + return ask_one_question( + question_answer_evaluation, model, temperature, system, is_evaluation=True + ) + + +def evaluate_one_answer(question, answer, evaluations_in): + evaluations_out = [] + for expected in [True, False]: + for evaluation in evaluations_in.get(expected, []): + actual, _runtime = ask_evaluation(question, answer, evaluation) + evaluations_out.append( + { + "evalution": evaluation, + "expected": expected, + "actual": actual, + } + ) + return evaluations_out + + +def evaluate_all_answers(question, answers, evaluation_questions): + evaluation_answers = {} + for answer in answers: + evaluation = evaluate_one_answer(question, answer, evaluation_questions) + evaluation_answers[answer] = evaluation + return evaluation_answers + + +def ask_all_questions(config): + q_and_a_in = load_yaml("q-and-a.yaml") + q_and_a_out = [] + for q_a in q_and_a_in: + question = q_a["Q"] + evaluations = q_a["evaluations"] + + human_answers = q_a["A"] + human_answers_evaluations = evaluate_all_answers( + question, human_answers, evaluations + ) + + llm_answers, runtime = ask_one_question(question, **config) + llm_answers_evaluations = evaluate_all_answers( + question, llm_answers, evaluations + ) + + q_and_a_out.append( + { + "question": question, + "human": human_answers_evaluations, + "llm": llm_answers_evaluations, + "runtime": str(runtime), + } + ) + return q_and_a_out + + +def get_scores(q_and_a): + correct = defaultdict(int) + total = defaultdict(int) + for human_llm in q_and_a: + for agent in ["human", "llm"]: + evaluations = human_llm[agent].values() + flat_list = [e for e_list in evaluations for e in e_list] + total[agent] += len(flat_list) + correct[agent] += sum(1 for e in flat_list if e["expected"] == e["actual"]) + + return {agent: f"{correct[agent]} / {total[agent]}" for agent in ["human", "llm"]} + + +def save_results(datetime_now, results): + yaml_out = dump( + results, + sort_keys=False, + allow_unicode=True, + default_flow_style=False, + ) + timestamp = re.sub(r"\..*", "", datetime_now).replace(":", "-") + out_path = Path(__file__).parent / "outputs" / f"{timestamp}.yaml" + out_path.write_text(yaml_out) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + config = get_config() + datetime_now = datetime.now().isoformat() + metadata = { + "config": config, + "datetime": datetime_now, + "git_hash": get_git_hash(), + } + q_and_a = ask_all_questions(config) + scores = get_scores(q_and_a) + results = {"metadata": metadata, "scores": scores, "q_and_a": q_and_a} + save_results(datetime_now, results) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9f59b2f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +pyyaml +pre-commit +flake8 +flake8-bugbear +black +openai +pytest +pyright diff --git a/tests/test_misc.py b/tests/test_misc.py new file mode 100644 index 0000000..dee4ba9 --- /dev/null +++ b/tests/test_misc.py @@ -0,0 +1,14 @@ +import subprocess +import pytest + + +tests = { + "flake8 linting": "flake8 . 
--count --show-source --statistics", + "pyright type checking": "pyright", +} + + +@pytest.mark.parametrize("cmd", tests.values(), ids=tests.keys()) +def test_subprocess(cmd: str): + result = subprocess.run(cmd, shell=True) + assert result.returncode == 0, f'"{cmd}" failed' diff --git a/tests/test_run.py b/tests/test_run.py new file mode 100644 index 0000000..ed5a220 --- /dev/null +++ b/tests/test_run.py @@ -0,0 +1,39 @@ +from llm_for_dp_qa.run import get_scores + + +def test_get_scores(): + q_and_a = [ + { + "human": { + "q1": [ + {"expected": True, "actual": True}, + {"expected": True, "actual": True}, + ] + }, + "llm": { + "q1": [ + {"expected": True, "actual": False}, + {"expected": True, "actual": False}, + ] + }, + }, + { + "human": { + "q1": [ + {"expected": False, "actual": False}, + ], + "q2": [ + {"expected": False, "actual": False}, + ], + }, + "llm": { + "q1": [ + {"expected": False, "actual": True}, + ], + "q2": [ + {"expected": False, "actual": False}, # Let it get one right. + ], + }, + }, + ] + assert get_scores(q_and_a) == {"human": "4 / 4", "llm": "1 / 4"}