-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlib.ts
139 lines (123 loc) · 3.89 KB
/
lib.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import { distinct } from 'https://deno.land/[email protected]/collections/distinct.ts';
import {
DOMParser,
Element,
HTMLDocument,
} from 'https://deno.land/x/[email protected]/deno-dom-wasm.ts';
import TurndownService from 'npm:turndown';
// Shared Turndown instance; crawl() calls its turndown() method on each page's
// HTML to produce the Markdown it collects.
const turndownService = new TurndownService();
/**
 * Crawls the pages reachable from `startUrl` (depth-first, one request at a
 * time) and converts each page to Markdown.
 *
 * For every page, the inner HTML of the first element matching `.main` is
 * converted; if there is no such element, the whole `<body>` is used instead.
 * Pages that cannot be fetched (network error, unsupported URL scheme, or a
 * non-2xx status) are logged and skipped rather than aborting the crawl.
 *
 * @param startUrl - URL of the first page to fetch.
 * @param boundingPath - When truthy, only onward URLs that contain this
 *   substring are followed. The start URL itself is always fetched.
 * @returns The set of distinct Markdown strings produced from the pages that
 *   were fetched successfully.
 */
export async function crawl(
  startUrl: string,
  boundingPath?: string | null,
): Promise<Set<string>> {
  const stack = [startUrl];
  // Every URL that has ever been pushed onto the stack. A URL in here is either
  // still waiting on the stack or has already been processed, so one lookup
  // replaces the separate "visited" and "already on the stack" checks.
  const seen = new Set<string>([startUrl]);
  const pageContents = new Set<string>();

  while (stack.length > 0) {
    const currentUrl = stack.pop();
    if (!currentUrl) {
      // Only reachable for an empty URL string; nothing to fetch.
      break;
    }

    let htmlString: string;
    try {
      const resp = await fetch(currentUrl);
      if (!resp.ok) {
        // Release the unread body before moving on.
        await resp.body?.cancel();
        console.error(`Skipping ${currentUrl}: got ${resp.status}`);
        continue;
      }
      htmlString = await resp.text();
    } catch (err) {
      // fetch rejects on network failures and on URLs it cannot request
      // (e.g. mailto: links); one bad link must not end the crawl.
      console.error(`Skipping ${currentUrl}: `, err);
      continue;
    }

    const document = new DOMParser().parseFromString(htmlString, 'text/html');
    if (!document) {
      continue;
    }

    // Prefer the page's `.main` element; fall back to the whole body.
    const contentRoot = document.querySelector('.main') ?? document.body;
    pageContents.add(turndownService.turndown(contentRoot.innerHTML));

    for (const url of getOnwardUrls(document, currentUrl)) {
      if (seen.has(url)) {
        continue;
      }
      if (boundingPath && !url.includes(boundingPath)) {
        continue;
      }
      seen.add(url);
      stack.push(url);
    }
  }

  return pageContents;
}
/**
 * Collects the distinct link targets of every `<a>` element in `document`.
 *
 * Each `href` is resolved against `baseUrl` (which is ignored when the href is
 * already absolute), then its query string and fragment are removed so that
 * links differing only in those parts collapse to one entry.
 *
 * Anchors without an `href`, with an empty `href`, or with an `href` that
 * cannot be parsed as a URL are skipped.
 *
 * @param document - Parsed page whose anchors are inspected.
 * @param baseUrl - URL of the page, used to resolve relative hrefs.
 * @returns The resolved URLs, de-duplicated, in order of first appearance.
 */
function getOnwardUrls(document: HTMLDocument, baseUrl: string): string[] {
  const anchorElements = [...document.querySelectorAll('a')] as Element[];
  // A Set keeps insertion order, so spreading it yields first-seen order.
  const urls = new Set<string>();

  for (const anchorElement of anchorElements) {
    const href = anchorElement.getAttribute('href');
    if (!href) {
      continue;
    }

    let url: URL;
    try {
      url = new URL(href, baseUrl);
    } catch {
      // The URL constructor throws on malformed input; ignore that link.
      continue;
    }

    url.search = '';
    url.hash = '';
    urls.add(url.toString());
  }

  return [...urls];
}
/**
 * Requests a single completion from Anthropic's `/v1/complete` endpoint.
 *
 * The prompt is wrapped as `\n\nHuman: <prompt>\n\nAssistant:` before being
 * sent. Failures are reported through the return value, never thrown.
 *
 * @param prompt - Text to place in the "Human" turn.
 * @param apiKey - Sent as the `x-api-key` request header.
 * @returns On HTTP 200, `{ success: true, data }` where `data` is the
 *   `completion` field of the JSON response. On any other status,
 *   `{ success: false, error }` where `error` is the response body (parsed as
 *   JSON when possible, otherwise the raw text). If the request itself fails or
 *   a 200 body is not valid JSON, `{ success: false, error }` carries the
 *   caught exception.
 */
export async function anthropic(prompt: string, apiKey: string) {
  const claudePrompt = `\n\nHuman: ${prompt}\n\nAssistant:`;
  const data = {
    prompt: claudePrompt,
    model: 'claude-v1-100k',
    // A maximum number of tokens to generate before stopping.
    max_tokens_to_sample: 300,
    stop_sequences: ['\n\nHuman:'],
  };
  // NOTE(review): no `anthropic-version` header is sent; confirm against the
  // current API documentation whether one is required.
  const options = {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': apiKey,
    },
    body: JSON.stringify(data),
  };

  try {
    console.log('Prompt: ', prompt);
    console.log('fetching Anthropic completion...');
    const resp = await fetch('https://api.anthropic.com/v1/complete', options);
    // Read the body as text first: an error response is not guaranteed to be
    // JSON, and parsing it eagerly would replace the real error with a
    // SyntaxError.
    const rawBody = await resp.text();

    if (resp.status !== 200) {
      console.error(`Got ${resp.status} from Anthropic`);
      try {
        return { success: false, error: JSON.parse(rawBody) };
      } catch {
        return { success: false, error: rawBody };
      }
    }

    const json = JSON.parse(rawBody);
    return { success: true, data: json.completion };
  } catch (err) {
    // TODO: surface error in UI
    console.error('Error getting completion: ', err);
    return { success: false, error: err };
  }
}
/**
 * Type guard that rejects `null` and `undefined` and accepts everything else,
 * including other falsy values such as `0`, `''` and `false`.
 *
 * Intended for use as an `Array.prototype.filter` callback so the resulting
 * array's element type no longer includes `null | undefined`.
 *
 * @param value - Value to test.
 * @returns `true` when `value` is neither `null` nor `undefined`.
 */
export function notEmpty<TValue>(
  value: TValue | null | undefined,
): value is TValue {
  return value !== null && value !== undefined;
}