root/trunk/rms-web/kses.php

Revision 4526, 18.9 kB (checked in by erijo, 2 years ago)

Removed svn:keywords from all files that don't need it. May make your
checkout a tiny bit faster :)

  • Property svn:eol-style set to native
Line 
1<?php
2
3# kses 0.2.2 - HTML/XHTML filter that only allows some elements and attributes
4# Copyright (C) 2002, 2003, 2005  Ulf Harnhammar
5#
6# This program is free software and open source software; you can redistribute
7# it and/or modify it under the terms of the GNU General Public License as
8# published by the Free Software Foundation; either version 2 of the License,
9# or (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful, but WITHOUT
12# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14# more details.
15#
16# You should have received a copy of the GNU General Public License along
17# with this program; if not, write to the Free Software Foundation, Inc.,
18# 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA  or visit
19# http://www.gnu.org/licenses/gpl.html
20#
21# *** CONTACT INFORMATION ***
22#
23# E-mail:      metaur at users dot sourceforge dot net
24# Web page:    http://sourceforge.net/projects/kses
25# Paper mail:  Ulf Harnhammar
26#              Ymergatan 17 C
27#              753 25  Uppsala
28#              SWEDEN
29#
30# [kses strips evil scripts!]
31
32
function kses($string, $allowed_html, $allowed_protocols =
               array('http', 'https', 'ftp', 'news', 'nntp', 'telnet',
                     'gopher', 'mailto'))
###############################################################################
# Main entry point of the filter.
#
# $string            - the text to filter. Any slashes added by PHP's magic
#                      quotes must already have been removed by the caller.
# $allowed_html      - array of element name => array of attribute name =>
#                      checks; keys may be written in any letter case.
# $allowed_protocols - URL protocols that may start an attribute value.
#
# Returns $string with only the allowed elements, attributes and attribute
# values left in it, and with its HTML entities normalized.
###############################################################################
{
  # Pre-processing runs innermost first: strip NULL characters, strip
  # Netscape 4 JavaScript entities, normalize entities, then run the hook.
  $prepared = kses_hook(
                kses_normalize_entities(
                  kses_js_entities(
                    kses_no_null($string))));

  # The whitelist is matched in lower case, so its keys are lower-cased here.
  return kses_split($prepared, kses_array_lc($allowed_html),
                    $allowed_protocols);
} # function kses
50
51
function kses_hook($string)
###############################################################################
# Extension point: kses() passes the text through this function after the
# entities have been normalized and before the tags are split. Site-specific
# filtering can be added here. As shipped it returns $string unchanged.
###############################################################################
{
  $hooked = $string;
  # (no hooks installed)

  return $hooked;
} # function kses_hook
59
60
function kses_version()
###############################################################################
# Returns the version number of this kses release as a string.
###############################################################################
{
  $version = '0.2.2';

  return $version;
} # function kses_version
68
69
function kses_split($string, $allowed_html, $allowed_protocols)
###############################################################################
# This function searches for HTML tags, no matter how malformed. It also
# matches stray ">" characters. Every match is handed to kses_split2() and
# replaced by whatever that function returns.
#
# $string            - the text to scan.
# $allowed_html      - whitelist with lower-case keys (see kses_array_lc()).
# $allowed_protocols - URL protocols allowed in attribute values.
#
# Returns the text with every tag filtered.
#
# The tag handler runs through preg_replace_callback(). The former
# preg_replace(//e) form was deprecated in PHP 5.5 and removed in PHP 7.0,
# where it makes preg_replace() fail and return NULL.
###############################################################################
{
  return preg_replace_callback(
           '%(<'.   # EITHER: <
           '[^>]*'. # things that aren't >
           '(>|$)'. # > or end of string
           '|>)%',  # OR: just a >
           function ($match) use ($allowed_html, $allowed_protocols)
           {
             # kses_split2() begins by turning \" back into ", which undid
             # the quoting added by preg_replace(//e). A callback receives
             # the raw text, so the same quoting is applied here; otherwise
             # a literal \" in the input would lose its backslash.
             $quoted = str_replace('"', '\\"', $match[1]);

             return kses_split2($quoted, $allowed_html, $allowed_protocols);
           },
           $string);
} # function kses_split
84
85
function kses_split2($string, $allowed_html, $allowed_protocols)
###############################################################################
# Handles one match found by kses_split(): either a (possibly malformed) tag
# or a stray ">" character.
#
# Returns "&gt;" for a stray ">", an empty string for a tag that is too
# malformed (like <:::>) or whose element is not in $allowed_html, the bare
# closing tag for an allowed closing element, and otherwise the result of
# kses_attr() for the element and its attribute list.
###############################################################################
{
  $tag = kses_stripslashes($string);

  # Anything that does not start with "<" is the lone ">" alternative.
  if (substr($tag, 0, 1) != '<')
    return '&gt;';

  # Pieces: 1 = optional closing slash, 2 = element name, 3 = attribute list.
  $wellformed = preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?$%', $tag,
                           $pieces);
  if (!$wellformed)
    return ''; # seriously malformed

  $closing = trim($pieces[1]);
  $name = $pieces[2];
  $attributes = $pieces[3];

  if (!isset($allowed_html[strtolower($name)]))
    return ''; # the element is not on the whitelist

  # Closing elements never carry attributes.
  if ($closing != '')
    return '<'.$closing.$name.'>';

  # $closing is empty from here on, so the element is just its name.
  return kses_attr($name, $attributes, $allowed_html, $allowed_protocols);
} # function kses_split2
119
120
function kses_attr($element, $attr, $allowed_html, $allowed_protocols)
###############################################################################
# This function removes all attributes, if none are allowed for this element.
# If some are allowed it calls kses_hair() to split them further, and then it
# builds up new HTML code from the data that kses_hair() returns. It also
# removes "<" and ">" characters, if there are any left. One more thing it
# does is to check if the tag has a closing XHTML slash, and if it does,
# it puts one in the returned code as well.
#
# $element           - element name as written in the tag.
# $attr              - the raw attribute list of the tag.
# $allowed_html      - whitelist with lower-case keys.
# $allowed_protocols - URL protocols allowed in attribute values.
#
# Returns the rebuilt opening tag as a string.
#
# An element that has no entry in $allowed_html, or whose entry is not an
# array, is treated as allowing no attributes. The former @count() call on
# such an entry throws a TypeError on PHP 8, which "@" cannot suppress.
###############################################################################
{
# Is there a closing XHTML slash at the end of the attributes?

  $xhtml_slash = '';
  if (preg_match('%\s/\s*$%', $attr))
    $xhtml_slash = ' /';

# Are any attributes allowed at all for this element?

  $elemkey = strtolower($element);
  $allowed_attrs = array();
  if (isset($allowed_html[$elemkey]) && is_array($allowed_html[$elemkey]))
    $allowed_attrs = $allowed_html[$elemkey];

  if (count($allowed_attrs) == 0)
    return "<$element$xhtml_slash>";

# Split it, and save the allowed attributes for this element in $attr2

  $attr2 = '';

  foreach (kses_hair($attr, $allowed_protocols) as $arreach)
  {
    $attrkey = strtolower($arreach['name']);

    if (!isset($allowed_attrs[$attrkey]))
      continue; # the attribute is not allowed

    $current = $allowed_attrs[$attrkey];
    $ok = true;

    # A non-array entry means "allowed without checks"; an array lists
    # check name => check value pairs that must all pass.
    if (is_array($current))
      foreach ($current as $currkey => $currval)
        if (!kses_check_attr_val($arreach['value'], $arreach['vless'],
                                 $currkey, $currval))
        { $ok = false; break; }

    if ($ok)
      $attr2 .= ' '.$arreach['whole'];
  } # foreach

# Remove any "<" or ">" characters

  $attr2 = preg_replace('/[<>]/', '', $attr2);

  return "<$element$attr2$xhtml_slash>";
} # function kses_attr
184
185
function kses_hair($attr, $allowed_protocols)
###############################################################################
# Parses an attribute list into an array of attributes, coping with badly
# formed input as well as it can.
#
# Each returned entry is an array with the keys
#   'name'  - the attribute name as written,
#   'value' - the value after kses_bad_protocol() ('' when valueless),
#   'whole' - the attribute rebuilt as HTML text,
#   'vless' - 'y' for a valueless attribute such as "selected", else 'n'.
#
# Values written without quotes or apostrophes are given double quotes in
# 'whole'. Text that cannot be parsed is dropped with kses_html_error().
###############################################################################
{
  # Accepted spellings of an attribute value, tried in this order. Each entry
  # holds the pattern and the quote character used when rebuilding 'whole'.
  $value_forms = array(
    array('/^"([^"]*)"(\s+|$)/',   '"'), # "value"
    array("/^'([^']*)'(\s+|$)/",   "'"), # 'value'
    array("%^([^\s\"']+)(\s+|$)%", '"'), # value, quotes get added
  );

  $found = array();
  $state = 0; # 0 = expect a name, 1 = expect "=" or whitespace, 2 = expect a value
  $name = '';

# Loop through the whole attribute list

  while (strlen($attr) != 0)
  {
    $consumed = null; # text parsed in this pass; null means nothing parsed

    if ($state == 0)
    {
      # attribute name, href for instance
      if (preg_match('/^([-a-zA-Z]+)/', $attr, $hit))
      {
        $name = $hit[1];
        $state = 1;
        $consumed = $hit[0];
      }
    }
    elseif ($state == 1)
    {
      if (preg_match('/^\s*=\s*/', $attr, $hit))
      {
        # equals sign, a value follows
        $state = 2;
        $consumed = $hit[0];
      }
      elseif (preg_match('/^\s+/', $attr, $hit))
      {
        # whitespace only, so the attribute is valueless
        $found[] = array
                    ('name'  => $name,
                     'value' => '',
                     'whole' => $name,
                     'vless' => 'y');
        $state = 0;
        $consumed = $hit[0];
      }
    }
    elseif ($state == 2)
    {
      # attribute value, a URL after href= for instance
      foreach ($value_forms as $form)
      {
        if (!preg_match($form[0], $attr, $hit))
          continue;

        $cleaned = kses_bad_protocol($hit[1], $allowed_protocols);

        $found[] = array
                    ('name'  => $name,
                     'value' => $cleaned,
                     'whole' => $name.'='.$form[1].$cleaned.$form[1],
                     'vless' => 'n');
        $state = 0;
        $consumed = $hit[0];
        break;
      }
    }

    if ($consumed === null)
    {
      # not well formed, remove and try again
      $attr = kses_html_error($attr);
      $state = 0;
    }
    else
    {
      # Every pattern is anchored at the start, so dropping the matched
      # prefix leaves exactly the text that is still unparsed.
      $attr = (string) substr($attr, strlen($consumed));
    }
  } # while

  # special case, for when the attribute list ends with a valueless
  # attribute like "selected"
  if ($state == 1)
    $found[] = array
                ('name'  => $name,
                 'value' => '',
                 'whole' => $name,
                 'vless' => 'y');

  return $found;
} # function kses_hair
309
310
function kses_check_attr_val($value, $vless, $checkname, $checkvalue)
###############################################################################
# This function performs different checks for attribute values. The currently
# implemented checks are "maxlen", "minlen", "maxval", "minval" and
# "valueless".
#
# $value      - the attribute value ('' for a valueless attribute).
# $vless      - 'y' if the attribute was written without a value, else 'n'.
# $checkname  - name of the check, in any letter case.
# $checkvalue - the limit or flag that belongs to the check.
#
# Returns true if the value passes the check, false otherwise. An unknown
# check name passes, as it always has.
###############################################################################
{
  $check = strtolower($checkname);

  switch ($check)
  {
    case 'maxlen':
    # The maxlen check makes sure that the attribute value has a length not
    # greater than the given value. This can be used to avoid Buffer Overflows
    # in WWW clients and various Internet servers.

      return !(strlen($value) > $checkvalue);

    case 'minlen':
    # The minlen check makes sure that the attribute value has a length not
    # smaller than the given value.

      return !(strlen($value) < $checkvalue);

    case 'maxval':
    case 'minval':
    # Both checks first require the attribute value to be an integer from 0
    # and up, without an excessive amount of zeroes or whitespace (to avoid
    # Buffer Overflows). maxval then requires it to be not greater than the
    # given value (useful against Denial of Service attacks), minval to be
    # not smaller than it.

      if (!preg_match('/^\s{0,6}[0-9]{1,6}\s{0,6}$/', $value))
        return false;

      # Compare as an integer. Before PHP 8 a value with trailing whitespace
      # such as "5 " is not a numeric string, so comparing it with a limit
      # given as a string ('10') was done character by character.
      $number = (int) $value;

      if ($check == 'maxval')
        return !($number > $checkvalue);

      return !($number < $checkvalue);

    case 'valueless':
    # The valueless check checks if the attribute has a value
    # (like <a href="blah">) or not (<option selected>). If the given value
    # is a "y" or a "Y", the attribute must not have a value.
    # If the given value is an "n" or an "N", the attribute must have one.

      return !(strtolower($checkvalue) != $vless);
  } # switch

  return true;
} # function kses_check_attr_val
375
376
function kses_bad_protocol($string, $allowed_protocols)
###############################################################################
# Strips every protocol that is not in $allowed_protocols from the beginning
# of $string. NULL characters and soft hyphens (\xad) are removed first.
# kses_bad_protocol_once() is then applied over and over until the text stops
# changing, so stacked protocols like "javascript:javascript:alert(57)" are
# removed completely.
#
# Returns the cleaned string.
###############################################################################
{
  $string = kses_no_null($string);
  $string = preg_replace('/\xad+/', '', $string); # deals with Opera "feature"

  do
  {
    $previous = $string;
    $string = kses_bad_protocol_once($previous, $allowed_protocols);
  }
  while ($string != $previous);

  return $string;
} # function kses_bad_protocol
397
398
function kses_no_null($string)
###############################################################################
# Removes NULL characters from $string: first the real NUL bytes, then any
# run of the two-character text \0 (a backslash followed by a zero).
###############################################################################
{
  # With an array of patterns preg_replace() applies them one after the
  # other, each on the result of the previous one.
  $null_patterns = array('/\0+/', '/(\\\\0)+/');

  return preg_replace($null_patterns, '', $string);
} # function kses_no_null
409
410
function kses_stripslashes($string)
###############################################################################
# Replaces every backslash-plus-double-quote pair ( \" ) with a plain double
# quote and leaves all other backslashes alone. The quoting done by
# preg_replace(//e) seems to require this.
###############################################################################
{
  $escaped_quote = '\\"'; # the two characters \ and "

  return str_replace($escaped_quote, '"', $string);
} # function kses_stripslashes
420
421
function kses_array_lc($inarray)
###############################################################################
# Returns a copy of the two-level array $inarray in which the keys of both
# levels (element names and attribute names) are lower case. The values of
# the second level are copied as they are.
###############################################################################
{
  $lowered = array();

  foreach ($inarray as $element => $attributes)
  {
    $inner = array();

    foreach ($attributes as $attribute => $checks)
      $inner[strtolower($attribute)] = $checks;

    $lowered[strtolower($element)] = $inner;
  } # foreach $inarray

  return $lowered;
} # function kses_array_lc
443
444
function kses_js_entities($string)
###############################################################################
# Removes the HTML JavaScript entities of the form &{...}; that early
# versions of Netscape 4 understood. An entity that is never closed is
# removed up to the end of the string.
###############################################################################
{
  $js_entity = '%&\s*\{[^}]*(\}\s*;?|$)%';
  $without = preg_replace($js_entity, '', $string);

  return $without;
} # function kses_js_entities
453
454
function kses_html_error($string)
###############################################################################
# Recovery step for parsing errors in kses_hair(). It drops everything from
# the start of $string up to and including the next run of whitespace, while
# treating text inside quotes or apostrophes as one unit. Returns what is
# left after that.
###############################################################################
{
  $junk = '/^("[^"]*("|$)|\'[^\']*(\'|$)|\S)*\s*/';
  $rest = preg_replace($junk, '', $string);

  return $rest;
} # function kses_html_error
464
465
function kses_bad_protocol_once($string, $allowed_protocols)
###############################################################################
# This function searches for a URL protocol at the beginning of $string, while
# handling whitespace and HTML entities. If it finds one, the protocol and
# the colon after it are replaced by what kses_bad_protocol_once2() returns.
# A string without a protocol is returned unchanged.
#
# The replacement is computed through preg_replace_callback(). The former
# preg_replace(//e) form pasted the matched text into a double-quoted PHP
# string before evaluating it, so input like {${...}} could run PHP code, and
# the //e modifier no longer exists in PHP 7.0 and later.
###############################################################################
{
  return preg_replace_callback(
           '/^((&[^;]*;|[\sA-Za-z0-9])*)'.
           '(:|&#58;|&#[Xx]3[Aa];)\s*/',
           function ($match) use ($allowed_protocols)
           {
             # $match[1] is the text in front of the colon, passed on as is.
             return kses_bad_protocol_once2($match[1], $allowed_protocols);
           },
           $string);
} # function kses_bad_protocol_once
477
478
function kses_bad_protocol_once2($string, $allowed_protocols)
###############################################################################
# Decides what a protocol found by kses_bad_protocol_once() is replaced with.
# $string is the text in front of the colon. It is decoded, stripped of
# whitespace, NULL characters and soft hyphens, and lower-cased. If the result
# equals one of $allowed_protocols (compared in lower case), it is returned
# followed by a colon; otherwise an empty string is returned.
###############################################################################
{
  $protocol = kses_decode_entities($string);
  $protocol = preg_replace('/\s/', '', $protocol);
  $protocol = kses_no_null($protocol);
  $protocol = preg_replace('/\xad+/', '', $protocol);
   # deals with Opera "feature"
  $protocol = strtolower($protocol);

  foreach ($allowed_protocols as $candidate)
  {
    if (strtolower($candidate) == $protocol)
      return "$protocol:"; # on the whitelist
  }

  return ''; # not on the whitelist
} # function kses_bad_protocol_once2
505
506
function kses_normalize_entities($string)
###############################################################################
# This function normalizes HTML entities. It will convert "AT&T" to the correct
# "AT&amp;T", "&#00058;" to "&#58;", "&#XYZZY;" to "&amp;#XYZZY;" and so on.
#
# Returns the normalized string.
#
# Decimal entities are handed to kses_normalize_entities2() through
# preg_replace_callback(). The former preg_replace(//e) form was deprecated
# in PHP 5.5 and removed in PHP 7.0.
###############################################################################
{
# Disarm all entities by converting & to &amp;

  $string = str_replace('&', '&amp;', $string);

# Change back the allowed entities in our entity whitelist

  # named entities: a letter followed by up to 19 letters or digits
  $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]{0,19});/',
                         '&\\1;', $string);

  # decimal entities: leading zeroes are dropped, 1 to 5 digits are kept
  $string = preg_replace_callback(
              '/&amp;#0*([0-9]{1,5});/',
              function ($match)
              {
                return kses_normalize_entities2($match[1]);
              },
              $string);

  # hexadecimal entities: leading zeroes are dropped, 2 or 4 digits are kept
  $string = preg_replace('/&amp;#([Xx])0*(([0-9A-Fa-f]{2}){1,2});/',
                         '&#\\1\\2;', $string);

  return $string;
} # function kses_normalize_entities
528
529
function kses_normalize_entities2($i)
###############################################################################
# Helper for kses_normalize_entities(). $i is the number of a &#number;
# entity. Numbers up to 65535 (16 bit values) give the entity itself; larger
# numbers give the entity with its ampersand escaped as &amp;.
###############################################################################
{
  if ($i > 65535)
    return "&amp;#$i;"; # too large, stays disarmed

  return "&#$i;";
} # function kses_normalize_entities2
538
539
function kses_decode_entities($string)
###############################################################################
# This function decodes numeric HTML entities (&#65; and &#x41;). It doesn't
# do anything with other entities like &auml;, but we don't need them in the
# URL protocol whitelisting system anyway.
#
# Returns $string with each numeric entity replaced by the single character
# that chr() gives for its number.
#
# The decoding runs through preg_replace_callback(). The former
# preg_replace(//e) form was deprecated in PHP 5.5 and removed in PHP 7.0.
###############################################################################
{
  # decimal entities first ...
  $string = preg_replace_callback(
              '/&#([0-9]+);/',
              function ($match)
              {
                return chr((int) $match[1]);
              },
              $string);

  # ... then hexadecimal ones, on the result of the first pass
  $string = preg_replace_callback(
              '/&#[Xx]([0-9A-Fa-f]+);/',
              function ($match)
              {
                return chr((int) hexdec($match[1]));
              },
              $string);

  return $string;
} # function kses_decode_entities
553
554?>
Note: See TracBrowser for help on using the browser.