1 | 1 | <?php
2 | 2 |
3 | | -function gpt_encode($text)
4 | | -{
5 | | -    $bpe_tokens = array();
6 | | -    if(empty($text))
7 | | -    {
8 | | -        return $bpe_tokens;
9 | | -    }
10 | | -    $raw_chars = file_get_contents(dirname(__FILE__) . "/characters.json");
11 | | -    $byte_encoder = json_decode($raw_chars, true);
12 | | -    if(empty($byte_encoder))
13 | | -    {
14 | | -        error_log('Failed to load characters.json: ' . $raw_chars);
15 | | -        return $bpe_tokens;
16 | | -    }
17 | | -    $rencoder = file_get_contents(dirname(__FILE__) . "/encoder.json");
18 | | -    $encoder = json_decode($rencoder, true);
19 | | -    if(empty($encoder))
20 | | -    {
21 | | -        error_log('Failed to load encoder.json: ' . $rencoder);
22 | | -        return $bpe_tokens;
23 | | -    }
| 3 | +require_once __DIR__.'/vendor/autoload.php';
24 | 4 |
25 | | -    $bpe_file = file_get_contents(dirname(__FILE__) . "/vocab.bpe");
26 | | -    if(empty($bpe_file))
27 | | -    {
28 | | -        error_log('Failed to load vocab.bpe');
29 | | -        return $bpe_tokens;
30 | | -    }
31 | | -
32 | | -    preg_match_all("#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u", $text, $matches);
33 | | -    if(!isset($matches[0]) || count($matches[0]) == 0)
34 | | -    {
35 | | -        error_log('Failed to match string: ' . $text);
36 | | -        return $bpe_tokens;
37 | | -    }
38 | | -    $lines = preg_split('/\r\n|\r|\n/', $bpe_file);
39 | | -    $bpe_merges = array();
40 | | -    $bpe_merges_temp = array_slice($lines, 1, count($lines), true);
41 | | -    foreach($bpe_merges_temp as $bmt)
42 | | -    {
43 | | -        $split_bmt = preg_split('#(\s+)#', $bmt);
44 | | -        $split_bmt = array_filter($split_bmt, 'gpt_my_filter');
45 | | -        if(count($split_bmt) > 0)
46 | | -        {
47 | | -            $bpe_merges[] = $split_bmt;
48 | | -        }
49 | | -    }
50 | | -    $bpe_ranks = gpt_dictZip($bpe_merges, range(0, count($bpe_merges) - 1));
51 | | -
52 | | -    $cache = array();
53 | | -    foreach($matches[0] as $token)
54 | | -    {
55 | | -        $new_tokens = array();
56 | | -        $chars = array();
57 | | -        $token = utf8_encode($token);
58 | | -        if(function_exists('mb_strlen'))
59 | | -        {
60 | | -            $len = mb_strlen($token, 'UTF-8');
61 | | -            for ($i = 0; $i < $len; $i++)
62 | | -            {
63 | | -                $chars[] = mb_substr($token, $i, 1, 'UTF-8');
64 | | -            }
65 | | -        }
66 | | -        else
67 | | -        {
68 | | -            $chars = str_split($token);
69 | | -        }
70 | | -        $result_word = '';
71 | | -        foreach($chars as $char)
72 | | -        {
73 | | -            if(isset($byte_encoder[gpt_unichr($char)]))
74 | | -            {
75 | | -                $result_word .= $byte_encoder[gpt_unichr($char)];
76 | | -            }
77 | | -        }
78 | | -        $new_tokens_bpe = gpt_bpe($result_word, $bpe_ranks, $cache);
79 | | -        $new_tokens_bpe = explode(' ', $new_tokens_bpe);
80 | | -        foreach($new_tokens_bpe as $x)
81 | | -        {
82 | | -            if(isset($encoder[$x]))
83 | | -            {
84 | | -                $new_tokens[$x] = $encoder[$x];
85 | | -            }
86 | | -            else
87 | | -            {
88 | | -                $new_tokens[$x] = $x;
89 | | -            }
90 | | -        }
91 | | -        foreach($new_tokens as $ninx => $nval)
92 | | -        {
93 | | -            if(isset($bpe_tokens[$ninx]))
94 | | -            {
95 | | -                $bpe_tokens[] = $nval;
96 | | -            }
97 | | -            else
98 | | -            {
99 | | -                $bpe_tokens[$ninx] = $nval;
100 | | -            }
101 | | -        }
102 | | -    }
103 | | -    return $bpe_tokens;
104 | | -}
105 | | -
106 | | -function gpt_my_filter($var)
107 | | -{
108 | | -    return ($var !== NULL && $var !== FALSE && $var !== '');
109 | | -}
110 | | -
111 | | -function gpt_unichr($c)
112 | | -{
113 | | -    if (ord($c[0]) >=0 && ord($c[0]) <= 127)
114 | | -    {
115 | | -        return ord($c[0]);
116 | | -    }
117 | | -    if (ord($c[0]) >= 192 && ord($c[0]) <= 223)
118 | | -    {
119 | | -        return (ord($c[0])-192)*64 + (ord($c[1])-128);
120 | | -    }
121 | | -    if (ord($c[0]) >= 224 && ord($c[0]) <= 239)
122 | | -    {
123 | | -        return (ord($c[0])-224)*4096 + (ord($c[1])-128)*64 + (ord($c[2])-128);
124 | | -    }
125 | | -    if (ord($c[0]) >= 240 && ord($c[0]) <= 247)
126 | | -    {
127 | | -        return (ord($c[0])-240)*262144 + (ord($c[1])-128)*4096 + (ord($c[2])-128)*64 + (ord($c[3])-128);
128 | | -    }
129 | | -    if (ord($c[0]) >= 248 && ord($c[0]) <= 251)
130 | | -    {
131 | | -        return (ord($c[0])-248)*16777216 + (ord($c[1])-128)*262144 + (ord($c[2])-128)*4096 + (ord($c[3])-128)*64 + (ord($c[4])-128);
132 | | -    }
133 | | -    if (ord($c[0]) >= 252 && ord($c[0]) <= 253)
134 | | -    {
135 | | -        return (ord($c[0])-252)*1073741824 + (ord($c[1])-128)*16777216 + (ord($c[2])-128)*262144 + (ord($c[3])-128)*4096 + (ord($c[4])-128)*64 + (ord($c[5])-128);
136 | | -    }
137 | | -    if (ord($c[0]) >= 254 && ord($c[0]) <= 255)
138 | | -    {
139 | | -        return 0;
140 | | -    }
141 | | -    return 0;
142 | | -}
143 | | -function gpt_dictZip($x, $y)
144 | | -{
145 | | -    $result = array();
146 | | -    $cnt = 0;
147 | | -    foreach($x as $i)
148 | | -    {
149 | | -        if(isset($i[1]) && isset($i[0]))
150 | | -        {
151 | | -            $result[$i[0] . ',' . $i[1]] = $cnt;
152 | | -            $cnt++;
153 | | -        }
154 | | -    }
155 | | -    return $result;
156 | | -}
157 | | -function gpt_get_pairs($word)
158 | | -{
159 | | -    $pairs = array();
160 | | -    $prev_char = $word[0];
161 | | -    for ($i = 1; $i < count($word); $i++)
162 | | -    {
163 | | -        $char = $word[$i];
164 | | -        $pairs[] = array($prev_char, $char);
165 | | -        $prev_char = $char;
166 | | -    }
167 | | -    return $pairs;
168 | | -}
169 | | -function gpt_split($str, $len = 1)
170 | | -{
171 | | -    $arr = [];
172 | | -    if(function_exists('mb_strlen'))
173 | | -    {
174 | | -        $length = mb_strlen($str, 'UTF-8');
175 | | -    }
176 | | -    else
177 | | -    {
178 | | -        $length = strlen($str);
179 | | -    }
180 | | -
181 | | -    for ($i = 0; $i < $length; $i += $len)
182 | | -    {
183 | | -        if(function_exists('mb_substr'))
184 | | -        {
185 | | -            $arr[] = mb_substr($str, $i, $len, 'UTF-8');
186 | | -        }
187 | | -        else
188 | | -        {
189 | | -            $arr[] = substr($str, $i, $len);
190 | | -        }
191 | | -    }
192 | | -    return $arr;
193 | | -
194 | | -}
195 | | -function gpt_bpe($token, $bpe_ranks, &$cache)
196 | | -{
197 | | -    if(array_key_exists($token, $cache))
198 | | -    {
199 | | -        return $cache[$token];
200 | | -    }
201 | | -    $word = gpt_split($token);
202 | | -    $init_len = count($word);
203 | | -    $pairs = gpt_get_pairs($word);
204 | | -    if(!$pairs)
205 | | -    {
206 | | -        return $token;
207 | | -    }
208 | | -    while (true)
209 | | -    {
210 | | -        $minPairs = array();
211 | | -        foreach($pairs as $pair)
212 | | -        {
213 | | -            if(array_key_exists($pair[0] . ','. $pair[1], $bpe_ranks))
214 | | -            {
215 | | -                $rank = $bpe_ranks[$pair[0] . ','. $pair[1]];
216 | | -                $minPairs[$rank] = $pair;
217 | | -            }
218 | | -            else
219 | | -            {
220 | | -                $minPairs[10e10] = $pair;
221 | | -            }
222 | | -        }
223 | | -        ksort($minPairs);
224 | | -        $min_key = array_key_first($minPairs);
225 | | -        foreach($minPairs as $mpi => $mp)
226 | | -        {
227 | | -            if($mpi < $min_key)
228 | | -            {
229 | | -                $min_key = $mpi;
230 | | -            }
231 | | -        }
232 | | -        $bigram = $minPairs[$min_key];
233 | | -        if(!array_key_exists($bigram[0] . ',' . $bigram[1], $bpe_ranks))
234 | | -        {
235 | | -            break;
236 | | -        }
237 | | -        $first = $bigram[0];
238 | | -        $second = $bigram[1];
239 | | -        $new_word = array();
240 | | -        $i = 0;
241 | | -        while ($i < count($word))
242 | | -        {
243 | | -            $j = gpt_indexOf($word, $first, $i);
244 | | -            if ($j === -1)
245 | | -            {
246 | | -                $new_word = array_merge($new_word, array_slice($word, $i, null, true));
247 | | -                break;
248 | | -            }
249 | | -            if($i > $j)
250 | | -            {
251 | | -                $slicer = array();
252 | | -            }
253 | | -            elseif($j == 0)
254 | | -            {
255 | | -                $slicer = array();
256 | | -            }
257 | | -            else
258 | | -            {
259 | | -                $slicer = array_slice($word, $i, $j - $i, true);
260 | | -            }
261 | | -            $new_word = array_merge($new_word, $slicer);
262 | | -            if(count($new_word) > $init_len)
263 | | -            {
264 | | -                break;
265 | | -            }
266 | | -            $i = $j;
267 | | -            if ($word[$i] === $first && $i < count($word) - 1 && $word[$i + 1] === $second)
268 | | -            {
269 | | -                array_push($new_word, $first . $second);
270 | | -                $i = $i + 2;
271 | | -            }
272 | | -            else
273 | | -            {
274 | | -                array_push($new_word, $word[$i]);
275 | | -                $i = $i + 1;
276 | | -            }
277 | | -        }
278 | | -        if($word == $new_word)
279 | | -        {
280 | | -            break;
281 | | -        }
282 | | -        $word = $new_word;
283 | | -        if (count($word) === 1)
284 | | -        {
285 | | -            break;
286 | | -        }
287 | | -        else
288 | | -        {
289 | | -            $pairs = gpt_get_pairs($word);
290 | | -        }
291 | | -    }
292 | | -    $word = implode(' ', $word);
293 | | -    $cache[$token] = $word;
294 | | -    return $word;
295 | | -}
296 | | -function gpt_indexOf($arrax, $searchElement, $fromIndex)
297 | | -{
298 | | -    $index = 0;
299 | | -    foreach($arrax as $index => $value)
300 | | -    {
301 | | -        if($index < $fromIndex)
302 | | -        {
303 | | -            $index++;
304 | | -            continue;
305 | | -        }
306 | | -        if($value == $searchElement)
307 | | -        {
308 | | -            return $index;
309 | | -        }
310 | | -        $index++;
311 | | -    }
312 | | -    return -1;
313 | | -}
| 5 | +use CodeRevolutionPlugins\GPT3Encoder\Encoder;
314 | 6 |
315 | 7 | $prompt = "Many words map to one token, but some don't: indivisible. Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 Sequences of characters commonly found next to each other may be grouped together: 1234567890";
316 | | -$token_array = gpt_encode($prompt);
| 8 | +$token_array = Encoder::instance()->encode($prompt);
317 | 9 | error_log('Token array: ' . print_r($token_array, true));
318 | 10 | error_log('Count: ' . count($token_array));
319 | 11 |
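
The three added lines are the entire replacement: the hand-rolled tokenizer is deleted and the encode call is delegated to the Composer-autoloaded Encoder class. Below is a minimal sketch of the resulting script, assuming the package that provides CodeRevolutionPlugins\GPT3Encoder\Encoder is declared in composer.json and installed so that vendor/autoload.php exists (the diff shows neither the package name nor the composer.json change).

<?php

// Composer autoloader; assumes `composer install` has been run and the
// dependency providing CodeRevolutionPlugins\GPT3Encoder\Encoder is present.
require_once __DIR__ . '/vendor/autoload.php';

use CodeRevolutionPlugins\GPT3Encoder\Encoder;

$prompt = "Many words map to one token, but some don't: indivisible.";

// Encoder::instance() appears to be a shared-instance accessor; encode()
// returns the token array that the old gpt_encode() built by hand.
$token_array = Encoder::instance()->encode($prompt);

error_log('Token array: ' . print_r($token_array, true));
error_log('Count: ' . count($token_array));

Loading characters.json, encoder.json, and vocab.bpe on every call, which the old gpt_encode() did, presumably becomes the library's responsibility.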
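For comparison, the deleted helpers implemented GPT-2/GPT-3 byte-level BPE by hand: pre-tokenize the text with the regex on old line 32, map each character through the characters.json byte encoder (via gpt_unichr), repeatedly merge the best-ranked adjacent pair from vocab.bpe (gpt_bpe with gpt_get_pairs, gpt_dictZip and gpt_indexOf), then look the merged pieces up in encoder.json. The sketch below illustrates that merge loop only; it assumes $bpe_ranks maps 'left,right' keys to merge priority the way gpt_dictZip() built it (lower rank merges earlier), and it is an illustration of the removed logic, not code from the diff.

<?php

// Greedy BPE merge over one pre-token's symbols (initially one symbol per
// byte-mapped character), following the idea behind the removed gpt_bpe().
function bpe_merge_sketch(array $symbols, array $bpe_ranks): array
{
    while (count($symbols) > 1) {
        // Find the adjacent pair with the lowest (best) merge rank.
        $bestPair = null;
        $bestRank = PHP_INT_MAX;
        for ($i = 0; $i < count($symbols) - 1; $i++) {
            $key = $symbols[$i] . ',' . $symbols[$i + 1];
            if (isset($bpe_ranks[$key]) && $bpe_ranks[$key] < $bestRank) {
                $bestRank = $bpe_ranks[$key];
                $bestPair = array($symbols[$i], $symbols[$i + 1]);
            }
        }
        if ($bestPair === null) {
            break; // no remaining pair is in the merge table
        }
        // Merge every occurrence of that pair, left to right.
        $merged = array();
        for ($i = 0; $i < count($symbols); $i++) {
            if ($i < count($symbols) - 1
                && $symbols[$i] === $bestPair[0]
                && $symbols[$i + 1] === $bestPair[1]) {
                $merged[] = $bestPair[0] . $bestPair[1];
                $i++; // skip the right half of the pair just merged
            } else {
                $merged[] = $symbols[$i];
            }
        }
        $symbols = $merged;
    }
    // The caller then maps each merged symbol to its id via encoder.json.
    return $symbols;
}

The removed version also cached results per token and walked the word with array slices and gpt_indexOf, but the pair-ranking idea is the same.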