Skip to content

Commit 1838c9e

Browse files
committed
refactor: autoloading and a class api
1 parent 32344c0 commit 1838c9e

File tree

7 files changed

+360
-312
lines changed

7 files changed

+360
-312
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/vendor/
2+
/composer.lock

composer.json

+8-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,13 @@
99
"email": "[email protected]"
1010
}
1111
],
12+
"autoload": {
13+
"psr-4": {
14+
"CodeRevolutionPlugins\\GPT3Encoder\\": "src/"
15+
}
16+
},
1217
"minimum-stability": "stable",
13-
"require": {}
18+
"require": {
19+
"php": ">7.4"
20+
}
1421
}
File renamed without changes.
File renamed without changes.
File renamed without changes.

gpt3-encoder.php

+3-311
Original file line numberDiff line numberDiff line change
@@ -1,319 +1,11 @@
11
<?php
22

3-
function gpt_encode($text)
4-
{
5-
$bpe_tokens = array();
6-
if(empty($text))
7-
{
8-
return $bpe_tokens;
9-
}
10-
$raw_chars = file_get_contents(dirname(__FILE__) . "/characters.json");
11-
$byte_encoder = json_decode($raw_chars, true);
12-
if(empty($byte_encoder))
13-
{
14-
error_log('Failed to load characters.json: ' . $raw_chars);
15-
return $bpe_tokens;
16-
}
17-
$rencoder = file_get_contents(dirname(__FILE__) . "/encoder.json");
18-
$encoder = json_decode($rencoder, true);
19-
if(empty($encoder))
20-
{
21-
error_log('Failed to load encoder.json: ' . $rencoder);
22-
return $bpe_tokens;
23-
}
3+
require_once __DIR__.'/vendor/autoload.php';
244

25-
$bpe_file = file_get_contents(dirname(__FILE__) . "/vocab.bpe");
26-
if(empty($bpe_file))
27-
{
28-
error_log('Failed to load vocab.bpe');
29-
return $bpe_tokens;
30-
}
31-
32-
preg_match_all("#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u", $text, $matches);
33-
if(!isset($matches[0]) || count($matches[0]) == 0)
34-
{
35-
error_log('Failed to match string: ' . $text);
36-
return $bpe_tokens;
37-
}
38-
$lines = preg_split('/\r\n|\r|\n/', $bpe_file);
39-
$bpe_merges = array();
40-
$bpe_merges_temp = array_slice($lines, 1, count($lines), true);
41-
foreach($bpe_merges_temp as $bmt)
42-
{
43-
$split_bmt = preg_split('#(\s+)#', $bmt);
44-
$split_bmt = array_filter($split_bmt, 'gpt_my_filter');
45-
if(count($split_bmt) > 0)
46-
{
47-
$bpe_merges[] = $split_bmt;
48-
}
49-
}
50-
$bpe_ranks = gpt_dictZip($bpe_merges, range(0, count($bpe_merges) - 1));
51-
52-
$cache = array();
53-
foreach($matches[0] as $token)
54-
{
55-
$new_tokens = array();
56-
$chars = array();
57-
$token = utf8_encode($token);
58-
if(function_exists('mb_strlen'))
59-
{
60-
$len = mb_strlen($token, 'UTF-8');
61-
for ($i = 0; $i < $len; $i++)
62-
{
63-
$chars[] = mb_substr($token, $i, 1, 'UTF-8');
64-
}
65-
}
66-
else
67-
{
68-
$chars = str_split($token);
69-
}
70-
$result_word = '';
71-
foreach($chars as $char)
72-
{
73-
if(isset($byte_encoder[gpt_unichr($char)]))
74-
{
75-
$result_word .= $byte_encoder[gpt_unichr($char)];
76-
}
77-
}
78-
$new_tokens_bpe = gpt_bpe($result_word, $bpe_ranks, $cache);
79-
$new_tokens_bpe = explode(' ', $new_tokens_bpe);
80-
foreach($new_tokens_bpe as $x)
81-
{
82-
if(isset($encoder[$x]))
83-
{
84-
$new_tokens[$x] = $encoder[$x];
85-
}
86-
else
87-
{
88-
$new_tokens[$x] = $x;
89-
}
90-
}
91-
foreach($new_tokens as $ninx => $nval)
92-
{
93-
if(isset($bpe_tokens[$ninx]))
94-
{
95-
$bpe_tokens[] = $nval;
96-
}
97-
else
98-
{
99-
$bpe_tokens[$ninx] = $nval;
100-
}
101-
}
102-
}
103-
return $bpe_tokens;
104-
}
105-
106-
function gpt_my_filter($var)
107-
{
108-
return ($var !== NULL && $var !== FALSE && $var !== '');
109-
}
110-
111-
function gpt_unichr($c)
112-
{
113-
if (ord($c[0]) >=0 && ord($c[0]) <= 127)
114-
{
115-
return ord($c[0]);
116-
}
117-
if (ord($c[0]) >= 192 && ord($c[0]) <= 223)
118-
{
119-
return (ord($c[0])-192)*64 + (ord($c[1])-128);
120-
}
121-
if (ord($c[0]) >= 224 && ord($c[0]) <= 239)
122-
{
123-
return (ord($c[0])-224)*4096 + (ord($c[1])-128)*64 + (ord($c[2])-128);
124-
}
125-
if (ord($c[0]) >= 240 && ord($c[0]) <= 247)
126-
{
127-
return (ord($c[0])-240)*262144 + (ord($c[1])-128)*4096 + (ord($c[2])-128)*64 + (ord($c[3])-128);
128-
}
129-
if (ord($c[0]) >= 248 && ord($c[0]) <= 251)
130-
{
131-
return (ord($c[0])-248)*16777216 + (ord($c[1])-128)*262144 + (ord($c[2])-128)*4096 + (ord($c[3])-128)*64 + (ord($c[4])-128);
132-
}
133-
if (ord($c[0]) >= 252 && ord($c[0]) <= 253)
134-
{
135-
return (ord($c[0])-252)*1073741824 + (ord($c[1])-128)*16777216 + (ord($c[2])-128)*262144 + (ord($c[3])-128)*4096 + (ord($c[4])-128)*64 + (ord($c[5])-128);
136-
}
137-
if (ord($c[0]) >= 254 && ord($c[0]) <= 255)
138-
{
139-
return 0;
140-
}
141-
return 0;
142-
}
143-
function gpt_dictZip($x, $y)
144-
{
145-
$result = array();
146-
$cnt = 0;
147-
foreach($x as $i)
148-
{
149-
if(isset($i[1]) && isset($i[0]))
150-
{
151-
$result[$i[0] . ',' . $i[1]] = $cnt;
152-
$cnt++;
153-
}
154-
}
155-
return $result;
156-
}
157-
function gpt_get_pairs($word)
158-
{
159-
$pairs = array();
160-
$prev_char = $word[0];
161-
for ($i = 1; $i < count($word); $i++)
162-
{
163-
$char = $word[$i];
164-
$pairs[] = array($prev_char, $char);
165-
$prev_char = $char;
166-
}
167-
return $pairs;
168-
}
169-
function gpt_split($str, $len = 1)
170-
{
171-
$arr = [];
172-
if(function_exists('mb_strlen'))
173-
{
174-
$length = mb_strlen($str, 'UTF-8');
175-
}
176-
else
177-
{
178-
$length = strlen($str);
179-
}
180-
181-
for ($i = 0; $i < $length; $i += $len)
182-
{
183-
if(function_exists('mb_substr'))
184-
{
185-
$arr[] = mb_substr($str, $i, $len, 'UTF-8');
186-
}
187-
else
188-
{
189-
$arr[] = substr($str, $i, $len);
190-
}
191-
}
192-
return $arr;
193-
194-
}
195-
function gpt_bpe($token, $bpe_ranks, &$cache)
196-
{
197-
if(array_key_exists($token, $cache))
198-
{
199-
return $cache[$token];
200-
}
201-
$word = gpt_split($token);
202-
$init_len = count($word);
203-
$pairs = gpt_get_pairs($word);
204-
if(!$pairs)
205-
{
206-
return $token;
207-
}
208-
while (true)
209-
{
210-
$minPairs = array();
211-
foreach($pairs as $pair)
212-
{
213-
if(array_key_exists($pair[0] . ','. $pair[1], $bpe_ranks))
214-
{
215-
$rank = $bpe_ranks[$pair[0] . ','. $pair[1]];
216-
$minPairs[$rank] = $pair;
217-
}
218-
else
219-
{
220-
$minPairs[10e10] = $pair;
221-
}
222-
}
223-
ksort($minPairs);
224-
$min_key = array_key_first($minPairs);
225-
foreach($minPairs as $mpi => $mp)
226-
{
227-
if($mpi < $min_key)
228-
{
229-
$min_key = $mpi;
230-
}
231-
}
232-
$bigram = $minPairs[$min_key];
233-
if(!array_key_exists($bigram[0] . ',' . $bigram[1], $bpe_ranks))
234-
{
235-
break;
236-
}
237-
$first = $bigram[0];
238-
$second = $bigram[1];
239-
$new_word = array();
240-
$i = 0;
241-
while ($i < count($word))
242-
{
243-
$j = gpt_indexOf($word, $first, $i);
244-
if ($j === -1)
245-
{
246-
$new_word = array_merge($new_word, array_slice($word, $i, null, true));
247-
break;
248-
}
249-
if($i > $j)
250-
{
251-
$slicer = array();
252-
}
253-
elseif($j == 0)
254-
{
255-
$slicer = array();
256-
}
257-
else
258-
{
259-
$slicer = array_slice($word, $i, $j - $i, true);
260-
}
261-
$new_word = array_merge($new_word, $slicer);
262-
if(count($new_word) > $init_len)
263-
{
264-
break;
265-
}
266-
$i = $j;
267-
if ($word[$i] === $first && $i < count($word) - 1 && $word[$i + 1] === $second)
268-
{
269-
array_push($new_word, $first . $second);
270-
$i = $i + 2;
271-
}
272-
else
273-
{
274-
array_push($new_word, $word[$i]);
275-
$i = $i + 1;
276-
}
277-
}
278-
if($word == $new_word)
279-
{
280-
break;
281-
}
282-
$word = $new_word;
283-
if (count($word) === 1)
284-
{
285-
break;
286-
}
287-
else
288-
{
289-
$pairs = gpt_get_pairs($word);
290-
}
291-
}
292-
$word = implode(' ', $word);
293-
$cache[$token] = $word;
294-
return $word;
295-
}
296-
function gpt_indexOf($arrax, $searchElement, $fromIndex)
297-
{
298-
$index = 0;
299-
foreach($arrax as $index => $value)
300-
{
301-
if($index < $fromIndex)
302-
{
303-
$index++;
304-
continue;
305-
}
306-
if($value == $searchElement)
307-
{
308-
return $index;
309-
}
310-
$index++;
311-
}
312-
return -1;
313-
}
5+
use CodeRevolutionPlugins\GPT3Encoder\Encoder;
3146

3157
$prompt = "Many words map to one token, but some don't: indivisible. Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 Sequences of characters commonly found next to each other may be grouped together: 1234567890";
316-
$token_array = gpt_encode($prompt);
8+
$token_array = Encoder::instance()->encode($prompt);
3179
error_log('Token array: ' . print_r($token_array, true));
31810
error_log('Count: ' . count($token_array));
31911

0 commit comments

Comments
 (0)