2
|
1 #include "autolink.h" |
|
2 |
|
3 #include <string.h> |
|
4 #include <stdlib.h> |
|
5 #include <stdio.h> |
|
6 #include <ctype.h> |
|
7 |
|
8 #ifndef _MSC_VER |
|
9 #include <strings.h> |
|
10 #else |
|
11 #define strncasecmp _strnicmp |
|
12 #endif |
|
13 |
|
/*
 * Decide whether a link target begins with one of the accepted prefixes.
 *
 * data - bytes of the link target (not required to be NUL-terminated)
 * size - number of bytes available at data
 *
 * Returns 1 when data starts (case-insensitively) with an accepted prefix
 * and the byte right after that prefix is alphanumeric; returns 0 otherwise.
 */
int
hoedown_autolink_is_safe(const uint8_t *data, size_t size)
{
	/* Accepted prefixes, each stored with its length in bytes. */
	static const struct {
		const char *text;
		size_t length;
	} prefixes[] = {
		{ "http://", 7 },
		{ "https://", 8 },
		{ "/", 1 },
		{ "#", 1 },
		{ "ftp://", 6 },
		{ "mailto:", 7 }
	};
	const size_t prefix_count = sizeof(prefixes) / sizeof(prefixes[0]);
	size_t k;

	for (k = 0; k < prefix_count; k++) {
		const size_t plen = prefixes[k].length;

		/* There must be at least one byte following the prefix. */
		if (size <= plen)
			continue;

		if (strncasecmp((const char *)data, prefixes[k].text, plen) != 0)
			continue;

		if (isalnum(data[plen]))
			return 1;
	}

	return 0;
}
|
35 |
|
/*
 * Trim a candidate link so that it does not swallow trailing markup or
 * punctuation.
 *
 * data       - first byte of the candidate link text
 * link_end   - tentative length of the link, in bytes
 * max_rewind - not used by this helper; kept so existing call sites compile
 * size       - not used by this helper; kept so existing call sites compile
 *
 * Returns the adjusted link length, or 0 when nothing is left.
 */
static size_t
autolink_delim(uint8_t *data, size_t link_end, size_t max_rewind, size_t size)
{
	uint8_t cclose, copen = 0;
	size_t i;

	(void)max_rewind;
	(void)size;

	/* A '<' starts markup: the link stops right before the first one. */
	for (i = 0; i < link_end; ++i) {
		if (data[i] == '<') {
			link_end = i;
			break;
		}
	}

	/* Peel trailing punctuation and trailing "&name;" entities. */
	while (link_end > 0) {
		const uint8_t last = data[link_end - 1];

		/* strchr() also matches the string terminator, so a trailing
		 * NUL byte is stripped just like the listed punctuation. */
		if (strchr("?!.,:", last) != NULL) {
			link_end--;
		} else if (last == ';') {
			size_t new_end;

			/* A lone ';' has no room for an entity before it. Drop
			 * it here so that `link_end - 2` below cannot wrap
			 * around and index memory before `data`. */
			if (link_end < 2) {
				link_end--;
				continue;
			}

			new_end = link_end - 2;

			/* Walk back over the alphabetic entity name. */
			while (new_end > 0 && isalpha(data[new_end]))
				new_end--;

			/* Only a non-empty name preceded by '&' is an entity;
			 * otherwise just the ';' is removed. */
			if (new_end < link_end - 2 && data[new_end] == '&')
				link_end = new_end;
			else
				link_end--;
		} else {
			break;
		}
	}

	if (link_end == 0)
		return 0;

	cclose = data[link_end - 1];

	switch (cclose) {
	case '"':  copen = '"';  break;
	case '\'': copen = '\''; break;
	case ')':  copen = '(';  break;
	case ']':  copen = '[';  break;
	case '}':  copen = '{';  break;
	default:   break;
	}

	if (copen != 0) {
		size_t closing = 0;
		size_t opening = 0;

		/* Count openers and closers inside data[0 .. link_end). If the
		 * counts differ, the final closer is taken to belong to the
		 * surrounding text and is dropped from the link.
		 *
		 * Examples (text passed in => text kept):
		 *
		 *	www.pokemon.com/Pikachu_(Electric)
		 *	=> www.pokemon.com/Pikachu_(Electric)
		 *
		 *	www.pokemon.com/Pikachu_(Electric))
		 *	=> www.pokemon.com/Pikachu_(Electric)
		 *
		 * For quotes the opener and closer are the same byte, so every
		 * occurrence is counted as an opener and the trailing quote is
		 * always dropped.
		 */
		for (i = 0; i < link_end; ++i) {
			if (data[i] == copen)
				opening++;
			else if (data[i] == cclose)
				closing++;
		}

		if (closing != opening)
			link_end--;
	}

	return link_end;
}
|
119 |
|
/*
 * Measure the run of domain-like bytes at the start of data.
 *
 * data        - bytes to inspect
 * size        - number of bytes available at data
 * allow_short - non-zero to accept a domain that contains no '.' or ':'
 *
 * The first byte must be alphanumeric. Scanning then continues over
 * alphanumerics, '-', '.' and ':' and stops at the first other byte. The
 * loop bound is size - 1, so the final byte of the input is never examined.
 *
 * Returns the index at which scanning stopped, or 0 when the input is
 * empty, does not start with an alphanumeric byte, or (with allow_short
 * zero) contains no '.' or ':' in the scanned part.
 */
static size_t
check_domain(uint8_t *data, size_t size, int allow_short)
{
	size_t i, np = 0;

	/* Reject empty input before touching data[0]; this also keeps
	 * `size - 1` below from wrapping around. */
	if (size == 0 || !isalnum(data[0]))
		return 0;

	for (i = 1; i < size - 1; ++i) {
		/* Compare explicitly rather than with strchr(), which would
		 * also match a NUL byte and count it as a separator. */
		if (data[i] == '.' || data[i] == ':')
			np++;
		else if (!isalnum(data[i]) && data[i] != '-')
			break;
	}

	if (allow_short) {
		/* We don't need a valid domain in the strict sense (with at
		 * least one dot), so just make sure it is composed of valid
		 * domain characters and return the length of the valid
		 * sequence. */
		return i;
	}

	/* A valid domain needs to have at least one dot.
	 * That's as far as we get. */
	return np ? i : 0;
}
|
145 |
|
146 size_t |
|
147 hoedown_autolink__www( |
|
148 size_t *rewind_p, |
|
149 hoedown_buffer *link, |
|
150 uint8_t *data, |
|
151 size_t max_rewind, |
|
152 size_t size, |
|
153 unsigned int flags) |
|
154 { |
|
155 size_t link_end; |
|
156 |
|
157 if (max_rewind > 0 && !ispunct(data[-1]) && !isspace(data[-1])) |
|
158 return 0; |
|
159 |
|
160 if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0) |
|
161 return 0; |
|
162 |
|
163 link_end = check_domain(data, size, 0); |
|
164 |
|
165 if (link_end == 0) |
|
166 return 0; |
|
167 |
|
168 while (link_end < size && !isspace(data[link_end])) |
|
169 link_end++; |
|
170 |
|
171 link_end = autolink_delim(data, link_end, max_rewind, size); |
|
172 |
|
173 if (link_end == 0) |
|
174 return 0; |
|
175 |
|
176 hoedown_buffer_put(link, data, link_end); |
|
177 *rewind_p = 0; |
|
178 |
|
179 return (int)link_end; |
|
180 } |
|
181 |
|
182 size_t |
|
183 hoedown_autolink__email( |
|
184 size_t *rewind_p, |
|
185 hoedown_buffer *link, |
|
186 uint8_t *data, |
|
187 size_t max_rewind, |
|
188 size_t size, |
|
189 unsigned int flags) |
|
190 { |
|
191 size_t link_end, rewind; |
|
192 int nb = 0, np = 0; |
|
193 |
|
194 for (rewind = 0; rewind < max_rewind; ++rewind) { |
|
195 uint8_t c = data[-1 - rewind]; |
|
196 |
|
197 if (isalnum(c)) |
|
198 continue; |
|
199 |
|
200 if (strchr(".+-_", c) != NULL) |
|
201 continue; |
|
202 |
|
203 break; |
|
204 } |
|
205 |
|
206 if (rewind == 0) |
|
207 return 0; |
|
208 |
|
209 for (link_end = 0; link_end < size; ++link_end) { |
|
210 uint8_t c = data[link_end]; |
|
211 |
|
212 if (isalnum(c)) |
|
213 continue; |
|
214 |
|
215 if (c == '@') |
|
216 nb++; |
|
217 else if (c == '.' && link_end < size - 1) |
|
218 np++; |
|
219 else if (c != '-' && c != '_') |
|
220 break; |
|
221 } |
|
222 |
|
223 if (link_end < 2 || nb != 1 || np == 0 || |
|
224 !isalpha(data[link_end - 1])) |
|
225 return 0; |
|
226 |
|
227 link_end = autolink_delim(data, link_end, max_rewind, size); |
|
228 |
|
229 if (link_end == 0) |
|
230 return 0; |
|
231 |
|
232 hoedown_buffer_put(link, data - rewind, link_end + rewind); |
|
233 *rewind_p = rewind; |
|
234 |
|
235 return link_end; |
|
236 } |
|
237 |
|
238 size_t |
|
239 hoedown_autolink__url( |
|
240 size_t *rewind_p, |
|
241 hoedown_buffer *link, |
|
242 uint8_t *data, |
|
243 size_t max_rewind, |
|
244 size_t size, |
|
245 unsigned int flags) |
|
246 { |
|
247 size_t link_end, rewind = 0, domain_len; |
|
248 |
|
249 if (size < 4 || data[1] != '/' || data[2] != '/') |
|
250 return 0; |
|
251 |
|
252 while (rewind < max_rewind && isalpha(data[-1 - rewind])) |
|
253 rewind++; |
|
254 |
|
255 if (!hoedown_autolink_is_safe(data - rewind, size + rewind)) |
|
256 return 0; |
|
257 |
|
258 link_end = strlen("://"); |
|
259 |
|
260 domain_len = check_domain( |
|
261 data + link_end, |
|
262 size - link_end, |
|
263 flags & HOEDOWN_AUTOLINK_SHORT_DOMAINS); |
|
264 |
|
265 if (domain_len == 0) |
|
266 return 0; |
|
267 |
|
268 link_end += domain_len; |
|
269 while (link_end < size && !isspace(data[link_end])) |
|
270 link_end++; |
|
271 |
|
272 link_end = autolink_delim(data, link_end, max_rewind, size); |
|
273 |
|
274 if (link_end == 0) |
|
275 return 0; |
|
276 |
|
277 hoedown_buffer_put(link, data - rewind, link_end + rewind); |
|
278 *rewind_p = rewind; |
|
279 |
|
280 return link_end; |
|
281 } |