blueprints-for-text-analytics-python · bmerkle · Jan 4, 2025 · Jan 4, 2025 · Jan 4, 2025
diff --git a/.gitignore b/.gitignore
@@ -127,3 +127,8 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# files created by notebook instructions
+ch04/lid.176.ftz
+ch04/reddit_dataframe.pkl
+ch04/reddit-selfposts.db
diff --git a/ch05/Feature_Engineering_Similarity.ipynb b/ch05/Feature_Engineering_Similarity.ipynb
@@ -451,7 +451,7 @@
    "source": [
     "from spacy.lang.en.stop_words import STOP_WORDS as stopwords\n",
     "print(len(stopwords))\n",
-    "tfidf = TfidfVectorizer(stop_words=stopwords)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords))\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "dt"
    ]
@@ -469,7 +469,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=2)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "dt"
    ]
@@ -480,7 +480,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, min_df=.0001)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=.0001)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "dt"
    ]
@@ -498,7 +498,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, max_df=0.1)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), max_df=0.1)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "dt"
    ]
@@ -527,11 +527,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,2), min_df=2)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "print(dt.shape)\n",
     "print(dt.data.nbytes)\n",
-    "tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,3), min_df=2)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,3), min_df=2)\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])\n",
     "print(dt.shape)\n",
     "print(dt.data.nbytes)"
@@ -575,7 +575,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords))\n",
     "dt = tfidf.fit_transform(headlines[\"lemmas\"].map(str))\n",
     "dt"
    ]
@@ -586,7 +586,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords))\n",
     "dt = tfidf.fit_transform(headlines[\"nav\"].map(str))\n",
     "dt"
    ]
@@ -634,7 +634,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=2)\n",
     "dt = tfidf.fit_transform(headlines[\"lemmas\"].map(str))\n",
     "dt"
    ]
@@ -690,7 +690,7 @@
    "source": [
     "# there are \"test\" headlines in the corpus\n",
     "stopwords.add(\"test\")\n",
-    "tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2, norm='l2')\n",
+    "tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,2), min_df=2, norm='l2')\n",
     "dt = tfidf.fit_transform(headlines[\"headline_text\"])"
    ]
   },
@@ -764,9 +764,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "%%time\n",
@@ -841,7 +839,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tfidf_word = TfidfVectorizer(stop_words=stopwords, min_df=1000)\n",
+    "tfidf_word = TfidfVectorizer(stop_words=list(stopwords), min_df=1000)\n",
     "dt_word = tfidf_word.fit_transform(headlines[\"headline_text\"])"
    ]
   },
@@ -880,7 +878,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "blueprints",
    "language": "python",
    "name": "python3"
   },
@@ -894,7 +892,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.8"
   },
   "toc": {
    "base_numbering": 1,