root/trunk/inc/URI.pm

Revision 93, 32.4 kB (checked in by clinton, 4 years ago)

Bundle Test::WWW::Mechanize and (most) of its dependencies to allow testing, tid10737 tid10892

Line 
1 package URI;
2
3 use strict;
4 use vars qw($VERSION);
5 $VERSION = "1.51";
6
7 use vars qw($ABS_REMOTE_LEADING_DOTS $ABS_ALLOW_RELATIVE_SCHEME $DEFAULT_QUERY_FORM_DELIMITER);
8
9 my %implements;  # mapping from scheme to implementor class
10
# Character sets used to build the escaping and scheme-matching regexps in
# this file.  They are declared with "use vars", i.e. as package variables.
# $unreserved and $uric are pre-escaped so they can be dropped straight into
# a [...] character class.

use vars qw($reserved $mark $unreserved $uric $scheme_re);

$reserved   = ';/?:@&=+$,[]';
$mark       = q{-_.!~*'()};
$unreserved = 'A-Za-z0-9' . quotemeta($mark);
$uric       = join '', quotemeta($reserved), $unreserved, '%';

# A scheme name: a letter followed by letters, digits, ".", "+" or "-".
$scheme_re  = q{[a-zA-Z][a-zA-Z0-9.+\-]*};
20
21 use Carp ();
22 use URI::Escape ();
23
24 use overload ('""'     => sub { ${$_[0]} },
25               '=='     => sub { _obj_eq(@_) },
26               '!='     => sub { !_obj_eq(@_) },
27               fallback => 1,
28              );
29
# Identity test behind the overloaded == and != operators: the two operands
# are the same object when their non-overloaded string forms
# (overload::StrVal) are equal.
sub _obj_eq {
    my $left  = overload::StrVal($_[0]);
    my $right = overload::StrVal($_[1]);
    return $left eq $right;
}
34
# Constructor.
#
#   $class  - invocant (not used to pick the resulting class)
#   $uri    - the URI; stringified, undef is treated as ""
#   $scheme - optional hint used only when $uri carries no scheme of its
#             own: a scheme name, a string starting with "scheme:", or an
#             object (its class is reused and its ->scheme is consulted)
#
# Returns whatever the chosen implementor class's _init returns.
sub new
{
    my($class, $uri, $scheme) = @_;

    # Force stringification.
    $uri = defined($uri) ? "$uri" : "";

    # Peel off <...>, <URL:...> and "..." wrappers, then surrounding blanks.
    $uri =~ s/^<(?:URL:)?(.*)>$/$1/;
    $uri =~ s/^"(.*)"$/$1/;
    $uri =~ s/^\s+//;
    $uri =~ s/\s+$//;

    my $impclass;
    if ($uri =~ m/^($scheme_re):/so) {
        # The URI names its own scheme; the hint is ignored.
        $scheme = $1;
    }
    elsif ($impclass = ref($scheme)) {
        # The hint is an object: keep its class, ask it for the scheme.
        $scheme = $scheme->scheme;
    }
    elsif ($scheme && $scheme =~ m/^($scheme_re)(?::|$)/o) {
        # The hint is a string: keep just its leading scheme name.
        $scheme = $1;
    }

    unless ($impclass) {
        $impclass = implementor($scheme);
        unless ($impclass) {
            # No implementation found for this scheme.
            require URI::_foreign;
            $impclass = 'URI::_foreign';
        }
    }

    return $impclass->_init($uri, $scheme);
}
66
67
# Builds a URI from $uri, passing $base to the constructor as the scheme
# hint, and returns the result of calling ->abs($base) on it.
sub new_abs
{
    my($class, $uri, $base) = @_;
    my $relative = $class->new($uri, $base);
    return $relative->abs($base);
}
74
75
# Creates the object: a blessed reference to the (escaped) URI string.
#
#   $class  - class to bless into
#   $str    - URI text; passed through $class->_uric_escape first
#   $scheme - prepended as "$scheme:" when the text has no scheme of its
#             own and the class does not report _no_scheme_ok
sub _init
{
    my($class, $str, $scheme) = @_;

    # Percent-encode every character that may not appear in a URI.
    my $escaped = $class->_uric_escape($str);

    my $has_scheme = $escaped =~ /^$scheme_re:/o;
    unless ($has_scheme || $class->_no_scheme_ok) {
        $escaped = "$scheme:$escaped";
    }

    return bless \$escaped, $class;
}
87
88
# Returns a copy of $str in which every character that is neither in the
# $uric set nor a "#" has been replaced by URI::Escape::escape_char's
# encoding of it.
sub _uric_escape
{
    my($class, $str) = @_;
    $str =~ s{([^$uric\#])}{ URI::Escape::escape_char($1) }ego;
    return $str;
}
95
96
# Looks up, or registers, the class that implements a URI scheme.
# Called as a plain function, not as a method.
#
#   implementor($scheme)
#       Returns the implementing class name.  A missing or syntactically
#       invalid scheme yields "URI::_generic".  For a valid scheme the
#       class is taken from the %implements cache or else derived as
#       "URI::<scheme>" ("+" -> "_P", "." -> "_O", "-" -> "_") and loaded on
#       demand.  Returns nothing when no such class can be found.
#
#   implementor($scheme, $impclass)
#       Registers $impclass for $scheme and returns the class that was
#       registered before (undef if none).
#
# Dies if loading the class fails for any reason other than the module not
# being found in @INC.  The caller's $@ is left untouched when the lookup
# merely comes up empty.
sub implementor
{
    my($scheme, $impclass) = @_;
    if (!$scheme || $scheme !~ /\A$scheme_re\z/o) {
        require URI::_generic;
        return "URI::_generic";
    }

    $scheme = lc($scheme);

    if ($impclass) {
        # Set the implementor class for a given scheme
        my $old = $implements{$scheme};
        $impclass->_init_implementor($scheme);
        $implements{$scheme} = $impclass;
        return $old;
    }

    my $ic = $implements{$scheme};
    return $ic if $ic;

    # scheme not yet known, look for internal or
    # preloaded (with 'use') implementation
    $ic = "URI::$scheme";  # default location

    # turn scheme into a valid perl identifier by a simple transformation...
    $ic =~ s/\+/_P/g;
    $ic =~ s/\./_O/g;
    $ic =~ s/\-/_/g;

    no strict 'refs';
    # check we actually have one for the scheme:
    unless (@{"${ic}::ISA"}) {
        # Try to load it.  The probe is expected to fail for unknown
        # schemes, so save and restore $@ to keep that failure from leaking
        # into the caller's error state.
        my $saved_error = $@;
        eval "require $ic";
        die $@ if $@ && $@ !~ /Can\'t locate.*in \@INC/;
        $@ = $saved_error;
        return unless @{"${ic}::ISA"};
    }

    $ic->_init_implementor($scheme);
    $implements{$scheme} = $ic;
    $ic;
}
140
141
# Hook invoked by implementor() each time a class is registered for a
# scheme.  The base version does nothing; because a single class can serve
# several schemes, it may be called more than once with different $scheme
# values.
sub _init_implementor
{
    my($class, $scheme) = @_;
}
148
149
# Returns an independent copy: a new scalar holding the same string,
# blessed into the same class as the original.
sub clone
{
    my $self = shift;
    my $string = ${$self};
    return bless \$string, ref($self);
}
156
157
# False here: _init prepends the scheme and _scheme refuses to strip it.
sub _no_scheme_ok { return 0 }
159
# Low-level scheme accessor that preserves the case used in the URI.
#
# Without arguments: returns the scheme exactly as written, or nothing
# (undef in scalar context) when the URI has no scheme.
#
# With an argument: changes the scheme and returns the previous one.
#   * A non-empty value must match $scheme_re in full, otherwise the method
#     croaks with "Bad scheme '...'".  The object is rebuilt via URI->new
#     and reblessed in place, so its class may change.
#   * undef or "" strips the scheme, but only when the class reports
#     _no_scheme_ok; otherwise the URI is left unchanged.
sub _scheme
{
    my $self = shift;

    unless (@_) {
        return unless $$self =~ /^($scheme_re):/o;
        return $1;
    }

    my $old;
    my $new = shift;
    if (defined($new) && length($new)) {
        # \A...\z rather than ^...$: "$" also matches before a trailing
        # newline, which would let "ftp\n" through and corrupt the URI.
        Carp::croak("Bad scheme '$new'") unless $new =~ /\A$scheme_re\z/o;
        $old = $1 if $$self =~ s/^($scheme_re)://o;
        my $newself = URI->new("$new:$$self");
        $$self = $$newself;
        bless $self, ref($newself);
    }
    else {
        if ($self->_no_scheme_ok) {
            $old = $1 if $$self =~ s/^($scheme_re)://o;
            # What is left might itself start with something scheme-like.
            Carp::carp("Oops, opaque part now look like scheme")
                if $^W && $$self =~ m/^$scheme_re:/o
        }
        else {
            $old = $1 if $$self =~ m/^($scheme_re):/o;
        }
    }

    return $old;
}
191
# Public scheme accessor: forwards all arguments to _scheme and returns its
# result lowercased.  Returns nothing (undef in scalar context) when _scheme
# returns no defined value.
sub scheme
{
    my $self = shift;
    my $raw = $self->_scheme(@_);
    return unless defined $raw;
    return lc($raw);
}
198
199
# Accessor for the part between the "scheme:" prefix and the "#fragment".
#
# Without arguments, returns that part as stored (escaped).  With an
# argument, replaces it (undef counts as ""), escaping every character not
# in $uric, and returns the previous value.  Scheme and fragment are kept.
sub opaque
{
    my $self = shift;

    if (!@_) {
        $$self =~ /^(?:$scheme_re:)?([^\#]*)/o or die;
        return $1;
    }

    $$self =~ /^($scheme_re:)?    # optional scheme
                ([^\#]*)          # opaque
                (\#.*)?           # optional fragment
              $/sx or die;
    my($old_scheme, $old_opaque, $old_frag) = ($1, $2, $3);

    my $new_opaque = shift;
    $new_opaque = "" unless defined $new_opaque;
    $new_opaque =~ s/([^$uric])/ URI::Escape::escape_char($1)/ego;

    # Reassemble: [scheme:] + new opaque part + [#fragment]
    my $rebuilt = defined($old_scheme) ? $old_scheme : "";
    $rebuilt .= $new_opaque;
    $rebuilt .= $old_frag if defined $old_frag;
    $$self = $rebuilt;

    return $old_opaque;
}
228
229 *path = \&opaque;  # alias
230
231
# Accessor for the fragment (the text after the first "#").
#
# Without arguments, returns the fragment, or nothing if the URI has none.
# With an argument, removes the current fragment, appends "#<new>" unless
# the new value is undef (characters outside $uric are escaped), and
# returns the previous fragment.
sub fragment
{
    my $self = shift;

    if (!@_) {
        return $1 if $$self =~ /\#(.*)/s;
        return;
    }

    my $new_frag = shift;

    my $old;
    if ($$self =~ s/\#(.*)//s) {
        $old = $1;
    }

    if (defined $new_frag) {
        $new_frag =~ s/([^$uric])/ URI::Escape::escape_char($1) /ego;
        $$self .= "#$new_frag";
    }

    return $old;
}
250
251
# Returns the URI as a plain string (the scalar the object wraps).
sub as_string
{
    my $self = shift;
    return ${$self};
}
257
258
# Returns the URI as a Unicode (IRI) string.
#
# Two transformations are applied to a copy of the URI string:
#   1. If the string contains an "xn--" label and the object provides an
#      ihost method with a true result, the host is replaced by that value.
#   2. Percent-escapes of bytes >= 0x80 are turned back into bytes and the
#      result is decoded as UTF-8.  Bytes that do not form valid UTF-8 are
#      re-escaped via URI::Escape::escape_char.
#
# The object itself is not modified.
sub as_iri
{
    my $self = shift;
    my $str = $$self;
    if ($str =~ /\bxn--/ && $self->can("ihost")) {
        my $ihost = $self->ihost;
        if ($ihost) {
            # Swap the host in through a placeholder on a clone.
            my $u = $self->clone;
            $u->host("%%host%%");
            $str = $u->as_string;
            $str =~ s/%%host%%/$ihost/;
        }
    }
    # Hex digits in escapes are case-insensitive, so accept "%c3%a9" as well
    # as "%C3%A9"; matching uppercase only left lowercase escapes undecoded.
    if ($str =~ s/%([89a-fA-F][0-9a-fA-F])/chr(hex($1))/eg) {
        # All this crap because the more obvious:
        #
        #   Encode::decode("UTF-8", $str, sub { sprintf "%%%02X", shift })
        #
        # doesn't work.  Apparently passing a sub as CHECK only works
        # for 'ascii' and similar direct encodings.

        require Encode;
        my $enc = Encode::find_encoding("UTF-8");
        my $u = "";
        while (length $str) {
            # FB_QUIET stops at the first malformed byte and leaves the
            # unprocessed remainder in $str.
            $u .= $enc->decode($str, Encode::FB_QUIET());
            if (length $str) {
                # escape next char
                $u .= URI::Escape::escape_char(substr($str, 0, 1, ""));
            }
        }
        $str = $u;
    }
    return $str;
}
294
295
# Returns a normalized form of the URI: scheme lowercased, escapes of
# unreserved characters decoded, all remaining escapes uppercased.
#
# When the URI has neither an uppercase letter in its scheme nor any
# percent-escape, the object itself is returned; otherwise the changes are
# made on a clone and the original is left as it was.
sub canonical
{
    my $self = shift;

    my $scheme      = $self->_scheme || "";
    my $has_upper   = $scheme =~ /[A-Z]/;
    my $has_escapes = $$self =~ /%[a-fA-F0-9]{2}/;
    return $self unless $has_upper || $has_escapes;

    my $other = $self->clone;
    $other->_scheme(lc $scheme) if $has_upper;

    if ($has_escapes) {
        $$other =~ s{%([0-9a-fA-F]{2})}
                    { my $hex  = $1;
                      my $char = chr(hex($hex));
                      $char =~ /^[$unreserved]\z/o ? $char : "%\U$hex"
                    }ge;
    }
    return $other;
}
319
# Generic URI comparison; usable as a method or as a plain function.
# A non-reference operand is first turned into a URI object, using the
# other operand as the scheme hint.  Two URIs are equal when they are of
# the same class and their canonical strings are identical.
sub eq {
    my($self, $other) = @_;

    ref $self  or $self  = URI->new($self, $other);
    ref $other or $other = URI->new($other, $self);

    return ref($self) eq ref($other)
        && $self->canonical->as_string eq $other->canonical->as_string;
}
328
# Base-class versions of the absolute/relative transformations: both
# ignore any arguments and hand back the invocant.
sub abs {
    my $self = shift;
    return $self;
}

sub rel {
    my $self = shift;
    return $self;
}
332
# Storable hook: the serialized form of a URI is just its string.
sub STORABLE_freeze {
    my($self, $cloning) = @_;
    return ${$self};
}
338
# Storable hook: restores the object by storing the frozen string back into
# the scalar it wraps.
sub STORABLE_thaw {
    my($self, $cloning, $frozen) = @_;
    ${$self} = $frozen;
}
343
344 1;
345
346 __END__
347
348 =head1 NAME
349
350 URI - Uniform Resource Identifiers (absolute and relative)
351
352 =head1 SYNOPSIS
353
354  $u1 = URI->new("http://www.perl.com");
355  $u2 = URI->new("foo", "http");
356  $u3 = $u2->abs($u1);
357  $u4 = $u3->clone;
358  $u5 = URI->new("HTTP://WWW.perl.com:80")->canonical;
359
360  $str = $u->as_string;
361  $str = "$u";
362
363  $scheme = $u->scheme;
364  $opaque = $u->opaque;
365  $path   = $u->path;
366  $frag   = $u->fragment;
367
368  $u->scheme("ftp");
369  $u->host("ftp.perl.com");
370  $u->path("cpan/");
371
372 =head1 DESCRIPTION
373
374 This module implements the C<URI> class.  Objects of this class
375 represent "Uniform Resource Identifier references" as specified in RFC
376 2396 (and updated by RFC 2732).
377
378 A Uniform Resource Identifier is a compact string of characters that
379 identifies an abstract or physical resource.  A Uniform Resource
380 Identifier can be further classified as either a Uniform Resource Locator
381 (URL) or a Uniform Resource Name (URN).  The distinction between URL
382 and URN does not matter to the C<URI> class interface. A
383 "URI-reference" is a URI that may have additional information attached
384 in the form of a fragment identifier.
385
386 An absolute URI reference consists of three parts:  a I<scheme>, a
387 I<scheme-specific part> and a I<fragment> identifier.  A subset of URI
388 references share a common syntax for hierarchical namespaces.  For
389 these, the scheme-specific part is further broken down into
390 I<authority>, I<path> and I<query> components.  These URIs can also
391 take the form of relative URI references, where the scheme (and
392 usually also the authority) component is missing, but implied by the
393 context of the URI reference.  The three forms of URI reference
394 syntax are summarized as follows:
395
396   <scheme>:<scheme-specific-part>#<fragment>
397   <scheme>://<authority><path>?<query>#<fragment>
398   <path>?<query>#<fragment>
399
400 The components into which a URI reference can be divided depend on the
401 I<scheme>.  The C<URI> class provides methods to get and set the
402 individual components.  The methods available for a specific
403 C<URI> object depend on the scheme.
404
405 =head1 CONSTRUCTORS
406
407 The following methods construct new C<URI> objects:
408
409 =over 4
410
411 =item $uri = URI->new( $str )
412
413 =item $uri = URI->new( $str, $scheme )
414
415 Constructs a new URI object.  The string
416 representation of a URI is given as argument, together with an optional
417 scheme specification.  Common URI wrappers like "" and <>, as well as
418 leading and trailing white space, are automatically removed from
419 the $str argument before it is processed further.
420
421 The constructor determines the scheme, maps this to an appropriate
422 URI subclass, constructs a new object of that class and returns it.
423
424 The $scheme argument is only used when $str is a
425 relative URI.  It can be either a simple string that
426 denotes the scheme, a string containing an absolute URI reference, or
427 an absolute C<URI> object.  If no $scheme is specified for a relative
428 URI $str, then $str is simply treated as a generic URI (no scheme-specific
429 methods available).
430
431 The set of characters available for building URI references is
432 restricted (see L<URI::Escape>).  Characters outside this set are
433 automatically escaped by the URI constructor.
434
435 =item $uri = URI->new_abs( $str, $base_uri )
436
437 Constructs a new absolute URI object.  The $str argument can
438 denote a relative or absolute URI.  If relative, then it is
439 absolutized using $base_uri as base. The $base_uri must be an absolute
440 URI.
441
442 =item $uri = URI::file->new( $filename )
443
444 =item $uri = URI::file->new( $filename, $os )
445
446 Constructs a new I<file> URI from a file name.  See L<URI::file>.
447
448 =item $uri = URI::file->new_abs( $filename )
449
450 =item $uri = URI::file->new_abs( $filename, $os )
451
452 Constructs a new absolute I<file> URI from a file name.  See
453 L<URI::file>.
454
455 =item $uri = URI::file->cwd
456
457 Returns the current working directory as a I<file> URI.  See
458 L<URI::file>.
459
460 =item $uri->clone
461
462 Returns a copy of the $uri.
463
464 =back
465
466 =head1 COMMON METHODS
467
468 The methods described in this section are available for all C<URI>
469 objects.
470
471 Methods that give access to components of a URI always return the
472 old value of the component.  The value returned is C<undef> if the
473 component was not present.  There is generally a difference between a
474 component that is empty (represented as C<"">) and a component that is
475 missing (represented as C<undef>).  If an accessor method is given an
476 argument, it updates the corresponding component in addition to
477 returning the old value of the component.  Passing an undefined
478 argument removes the component (if possible).  The description of
479 each accessor method indicates whether the component is passed as
480 an escaped or an unescaped string.  Components that can be further
481 divided into sub-parts are usually passed escaped, as unescaping might
482 change their semantics.
483
484 The common methods available for all URI are:
485
486 =over 4
487
488 =item $uri->scheme
489
490 =item $uri->scheme( $new_scheme )
491
492 Sets and returns the scheme part of the $uri.  If the $uri is
493 relative, then $uri->scheme returns C<undef>.  If called with an
494 argument, it updates the scheme of $uri, possibly changing the
495 class of $uri, and returns the old scheme value.  The method croaks
496 if the new scheme name is illegal; a scheme name must begin with a
497 letter and must consist of only US-ASCII letters, numbers, and a few
498 special marks: ".", "+", "-".  This restriction effectively means
499 that the scheme must be passed unescaped.  Passing an undefined
500 argument to the scheme method makes the URI relative (if possible).
501
502 Letter case does not matter for scheme names.  The string
503 returned by $uri->scheme is always lowercase.  If you want the scheme
504 just as it was written in the URI in its original case,
505 you can use the $uri->_scheme method instead.
506
507 =item $uri->opaque
508
509 =item $uri->opaque( $new_opaque )
510
511 Sets and returns the scheme-specific part of the $uri
512 (everything between the scheme and the fragment)
513 as an escaped string.
514
515 =item $uri->path
516
517 =item $uri->path( $new_path )
518
519 Sets and returns the same value as $uri->opaque unless the URI
520 supports the generic syntax for hierarchical namespaces.
521 In that case the generic method is overridden to set and return
522 the part of the URI between the I<host name> and the I<fragment>.
523
524 =item $uri->fragment
525
526 =item $uri->fragment( $new_frag )
527
528 Returns the fragment identifier of a URI reference
529 as an escaped string.
530
531 =item $uri->as_string
532
533 Returns the URI object as a plain ASCII string.  URI objects are
534 also converted to plain strings automatically by overloading.  This
535 means that $uri objects can be used as plain strings in most Perl
536 constructs.
537
538 =item $uri->as_iri
539
540 Returns a Unicode string representing the URI.  Escaped UTF-8 sequences
541 representing non-ASCII characters are turned into their corresponding Unicode
542 code point.
543
544 =item $uri->canonical
545
546 Returns a normalized version of the URI.  The rules
547 for normalization are scheme-dependent.  They usually involve
548 lowercasing the scheme and Internet host name components,
549 removing the explicit port specification if it matches the default port,
550 uppercasing all escape sequences, and unescaping octets that can be
551 better represented as plain characters.
552
553 For efficiency reasons, if the $uri is already in normalized form,
554 then a reference to it is returned instead of a copy.
555
556 =item $uri->eq( $other_uri )
557
558 =item URI::eq( $first_uri, $other_uri )
559
560 Tests whether two URI references are equal.  URI references
561 that normalize to the same string are considered equal.  The method
562 can also be used as a plain function which can also test two string
563 arguments.
564
565 If you need to test whether two C<URI> object references denote the
566 same object, use the '==' operator.
567
568 =item $uri->abs( $base_uri )
569
570 Returns an absolute URI reference.  If $uri is already
571 absolute, then a reference to it is simply returned.  If the $uri
572 is relative, then a new absolute URI is constructed by combining the
573 $uri and the $base_uri, and returned.
574
575 =item $uri->rel( $base_uri )
576
577 Returns a relative URI reference if it is possible to
578 make one that denotes the same resource relative to $base_uri.
579 If not, then $uri is simply returned.
580
581 =back
582
583 =head1 GENERIC METHODS
584
585 The following methods are available to schemes that use the
586 common/generic syntax for hierarchical namespaces.  The descriptions of
587 schemes below indicate which these are.  Unknown schemes are
588 assumed to support the generic syntax, and therefore the following
589 methods:
590
591 =over 4
592
593 =item $uri->authority
594
595 =item $uri->authority( $new_authority )
596
597 Sets and returns the escaped authority component
598 of the $uri.
599
600 =item $uri->path
601
602 =item $uri->path( $new_path )
603
604 Sets and returns the escaped path component of
605 the $uri (the part between the host name and the query or fragment).
606 The path can never be undefined, but it can be the empty string.
607
608 =item $uri->path_query
609
610 =item $uri->path_query( $new_path_query )
611
612 Sets and returns the escaped path and query
613 components as a single entity.  The path and the query are
614 separated by a "?" character, but the query can itself contain "?".
615
616 =item $uri->path_segments
617
618 =item $uri->path_segments( $segment, ... )
619
620 Sets and returns the path.  In a scalar context, it returns
621 the same value as $uri->path.  In a list context, it returns the
622 unescaped path segments that make up the path.  Path segments that
623 have parameters are returned as an anonymous array.  The first element
624 is the unescaped path segment proper;  subsequent elements are escaped
625 parameter strings.  Such an anonymous array uses overloading so it can
626 be treated as a string too, but this string does not include the
627 parameters.
628
629 Note that absolute paths have the empty string as their first
630 I<path_segment>, i.e. the I<path> C</foo/bar> has 3
631 I<path_segments>: "", "foo" and "bar".
632
633 =item $uri->query
634
635 =item $uri->query( $new_query )
636
637 Sets and returns the escaped query component of
638 the $uri.
639
640 =item $uri->query_form
641
642 =item $uri->query_form( $key1 => $val1, $key2 => $val2, ... )
643
644 =item $uri->query_form( $key1 => $val1, $key2 => $val2, ..., $delim )
645
646 =item $uri->query_form( \@key_value_pairs )
647
648 =item $uri->query_form( \@key_value_pairs, $delim )
649
650 =item $uri->query_form( \%hash )
651
652 =item $uri->query_form( \%hash, $delim )
653
654 Sets and returns query components that use the
655 I<application/x-www-form-urlencoded> format.  Key/value pairs are
656 separated by "&", and the key is separated from the value by a "="
657 character.
658
659 The form can be set either by passing separate key/value pairs, or via
660 an array or hash reference.  Passing an empty array or an empty hash
661 removes the query component, whereas passing no arguments at all leaves
662 the component unchanged.  The order of keys is undefined if a hash
663 reference is passed.  The old value is always returned as a list of
664 separate key/value pairs.  Assigning this list to a hash is unwise as
665 the keys returned might repeat.
666
667 The values passed when setting the form can be plain strings or
668 references to arrays of strings.  Passing an array of values has the
669 same effect as passing the key repeatedly with one value at a time.
670 All the following statements have the same effect:
671
672     $uri->query_form(foo => 1, foo => 2);
673     $uri->query_form(foo => [1, 2]);
674     $uri->query_form([ foo => 1, foo => 2 ]);
675     $uri->query_form([ foo => [1, 2] ]);
676     $uri->query_form({ foo => [1, 2] });
677
678 The $delim parameter can be passed as ";" to force the key/value pairs
679 to be delimited by ";" instead of "&" in the query string.  This
680 practice is often recommended for URLs embedded in HTML or XML
681 documents as this avoids the trouble of escaping the "&" character.
682 You might also set the $URI::DEFAULT_QUERY_FORM_DELIMITER variable to
683 ";" for the same global effect.
684
685 The C<URI::QueryParam> module can be loaded to add further methods to
686 manipulate the form of a URI.  See L<URI::QueryParam> for details.
687
688 =item $uri->query_keywords
689
690 =item $uri->query_keywords( $keywords, ... )
691
692 =item $uri->query_keywords( \@keywords )
693
694 Sets and returns query components that use the
695 keywords separated by "+" format.
696
697 The keywords can be set either by passing separate keywords directly
698 or by passing a reference to an array of keywords.  Passing an empty
699 array removes the query component, whereas passing no arguments at
700 all leaves the component unchanged.  The old value is always returned
701 as a list of separate words.
702
703 =back
704
705 =head1 SERVER METHODS
706
707 For schemes where the I<authority> component denotes an Internet host,
708 the following methods are available in addition to the generic
709 methods.
710
711 =over 4
712
713 =item $uri->userinfo
714
715 =item $uri->userinfo( $new_userinfo )
716
717 Sets and returns the escaped userinfo part of the
718 authority component.
719
720 For some schemes this is a user name and a password separated by
721 a colon.  This practice is not recommended. Embedding passwords in
722 clear text (such as URI) has proven to be a security risk in almost
723 every case where it has been used.
724
725 =item $uri->host
726
727 =item $uri->host( $new_host )
728
729 Sets and returns the unescaped hostname.
730
731 If the $new_host string ends with a colon and a number, then this
732 number also sets the port.
733
734 For IPv6 addresses the brackets around the raw address are removed in the return
735 value from $uri->host.  When setting the host attribute to an IPv6 address you
736 can use a raw address or one enclosed in brackets.  The address needs to be
737 enclosed in brackets if you want to pass in a new port value as well.
738
739 =item $uri->ihost
740
741 Returns the host in Unicode form.  Any IDNA A-labels are turned into U-labels.
742
743 =item $uri->port
744
745 =item $uri->port( $new_port )
746
747 Sets and returns the port.  The port is a simple integer
748 that should be greater than 0.
749
750 If a port is not specified explicitly in the URI, then the URI scheme's default port
751 is returned. If you don't want the default port
752 substituted, then you can use the $uri->_port method instead.
753
754 =item $uri->host_port
755
756 =item $uri->host_port( $new_host_port )
757
758 Sets and returns the host and port as a single
759 unit.  The returned value includes a port, even if it matches the
760 default port.  The host part and the port part are separated by a
761 colon: ":".
762
763 For IPv6 addresses the bracketing is preserved; thus
764 URI->new("http://[::1]/")->host_port returns "[::1]:80".  Contrast this with
765 $uri->host which will remove the brackets.
766
767 =item $uri->default_port
768
769 Returns the default port of the URI scheme to which $uri
770 belongs.  For I<http> this is the number 80, for I<ftp> this
771 is the number 21, etc.  The default port for a scheme can not be
772 changed.
773
774 =back
775
776 =head1 SCHEME-SPECIFIC SUPPORT
777
778 Scheme-specific support is provided for the following URI schemes.  For C<URI>
779 objects that do not belong to one of these, you can only use the common and
780 generic methods.
781
782 =over 4
783
784 =item B<data>:
785
786 The I<data> URI scheme is specified in RFC 2397.  It allows inclusion
787 of small data items as "immediate" data, as if it had been included
788 externally.
789
790 C<URI> objects belonging to the data scheme support the common methods
791 and two new methods to access their scheme-specific components:
792 $uri->media_type and $uri->data.  See L<URI::data> for details.
793
794 =item B<file>:
795
796 An old specification of the I<file> URI scheme is found in RFC 1738.
797 A new RFC 2396 based specification is not available yet, but file URI
798 references are in common use.
799
800 C<URI> objects belonging to the file scheme support the common and
801 generic methods.  In addition, they provide two methods for mapping file URIs
802 back to local file names; $uri->file and $uri->dir.  See L<URI::file>
803 for details.
804
805 =item B<ftp>:
806
807 An old specification of the I<ftp> URI scheme is found in RFC 1738.  A
808 new RFC 2396 based specification is not available yet, but ftp URI
809 references are in common use.
810
811 C<URI> objects belonging to the ftp scheme support the common,
812 generic and server methods.  In addition, they provide two methods for
813 accessing the userinfo sub-components: $uri->user and $uri->password.
814
815 =item B<gopher>:
816
817 The I<gopher> URI scheme is specified in
818 <draft-murali-url-gopher-1996-12-04> and will hopefully be available
819 as a RFC 2396 based specification.
820
821 C<URI> objects belonging to the gopher scheme support the common,
822 generic and server methods. In addition, they support some methods for
823 accessing gopher-specific path components: $uri->gopher_type,
824 $uri->selector, $uri->search, $uri->string.
825
826 =item B<http>:
827
828 The I<http> URI scheme is specified in RFC 2616.
829 The scheme is used to reference resources hosted by HTTP servers.
830
831 C<URI> objects belonging to the http scheme support the common,
832 generic and server methods.
833
834 =item B<https>:
835
836 The I<https> URI scheme is a Netscape invention which is commonly
837 implemented.  The scheme is used to reference HTTP servers through SSL
838 connections.  Its syntax is the same as http, but the default
839 port is different.
840
841 =item B<ldap>:
842
843 The I<ldap> URI scheme is specified in RFC 2255.  LDAP is the
844 Lightweight Directory Access Protocol.  An ldap URI describes an LDAP
845 search operation to perform to retrieve information from an LDAP
846 directory.
847
848 C<URI> objects belonging to the ldap scheme support the common,
849 generic and server methods as well as ldap-specific methods: $uri->dn,
850 $uri->attributes, $uri->scope, $uri->filter, $uri->extensions.  See
851 L<URI::ldap> for details.
852
853 =item B<ldapi>:
854
855 Like the I<ldap> URI scheme, but uses a UNIX domain socket.  The
856 server methods are not supported, and the local socket path is
857 available as $uri->un_path.  The I<ldapi> scheme is used by the
858 OpenLDAP package.  There is no real specification for it, but it is
859 mentioned in various OpenLDAP manual pages.
860
861 =item B<ldaps>:
862
863 Like the I<ldap> URI scheme, but uses an SSL connection.  This
864 scheme is deprecated, as the preferred way is to use the I<start_tls>
865 mechanism.
866
867 =item B<mailto>:
868
869 The I<mailto> URI scheme is specified in RFC 2368.  The scheme was
870 originally used to designate the Internet mailing address of an
871 individual or service.  It has (in RFC 2368) been extended to allow
872 setting of other mail header fields and the message body.
873
874 C<URI> objects belonging to the mailto scheme support the common
875 methods and the generic query methods.  In addition, they support the
876 following mailto-specific methods: $uri->to, $uri->headers.
877
878 Note that the "foo@example.com" part of a mailto is I<not> the
879 C<userinfo> and C<host> but instead the C<path>.  This allows a
880 mailto to contain multiple comma-separated email addresses.
881
882 =item B<mms>:
883
884 The I<mms> URL specification can be found at L<http://sdp.ppona.com/>.
885 C<URI> objects belonging to the mms scheme support the common,
886 generic, and server methods, with the exception of userinfo and
887 query-related sub-components.
888
889 =item B<news>:
890
891 The I<news>, I<nntp> and I<snews> URI schemes are specified in
892 <draft-gilman-news-url-01> and will hopefully be available as an RFC
893 2396 based specification soon.
894
895 C<URI> objects belonging to the news scheme support the common,
896 generic and server methods.  In addition, they provide some methods to
897 access the path: $uri->group and $uri->message.
898
899 =item B<nntp>:
900
901 See I<news> scheme.
902
903 =item B<pop>:
904
905 The I<pop> URI scheme is specified in RFC 2384. The scheme is used to
906 reference a POP3 mailbox.
907
908 C<URI> objects belonging to the pop scheme support the common, generic
909 and server methods.  In addition, they provide two methods to access the
910 userinfo components: $uri->user and $uri->auth.
911
912 =item B<rlogin>:
913
914 An old specification of the I<rlogin> URI scheme is found in RFC
915 1738. C<URI> objects belonging to the rlogin scheme support the
916 common, generic and server methods.
917
918 =item B<rtsp>:
919
920 The I<rtsp> URL specification can be found in section 3.2 of RFC 2326.
921 C<URI> objects belonging to the rtsp scheme support the common,
922 generic, and server methods, with the exception of userinfo and
923 query-related sub-components.
924
925 =item B<rtspu>:
926
927 The I<rtspu> URI scheme is used to talk to RTSP servers over UDP
928 instead of TCP.  The syntax is the same as rtsp.
929
930 =item B<rsync>:
931
932 Information about rsync is available from http://rsync.samba.org.
933 C<URI> objects belonging to the rsync scheme support the common,
934 generic and server methods.  In addition, they provide methods to
935 access the userinfo sub-components: $uri->user and $uri->password.
936
937 =item B<sip>:
938
939 The I<sip> URI specification is described in sections 19.1 and 25
940 of RFC 3261.  C<URI> objects belonging to the sip scheme support the
941 common, generic, and server methods with the exception of path related
942 sub-components.  In addition, they provide two methods to get and set
943 I<sip> parameters: $uri->params_form and $uri->params.
944
945 =item B<sips>:
946
947 See I<sip> scheme.  Its syntax is the same as sip, but the default
948 port is different.
949
950 =item B<snews>:
951
952 See I<news> scheme.  Its syntax is the same as news, but the default
953 port is different.
954
955 =item B<telnet>:
956
957 An old specification of the I<telnet> URI scheme is found in RFC
958 1738. C<URI> objects belonging to the telnet scheme support the
959 common, generic and server methods.
960
961 =item B<tn3270>:
962
963 These URIs are used like I<telnet> URIs but for connections to IBM
964 mainframes.  C<URI> objects belonging to the tn3270 scheme support the
965 common, generic and server methods.
966
967 =item B<ssh>:
968
969 Information about ssh is available at http://www.openssh.com/.
970 C<URI> objects belonging to the ssh scheme support the common,
971 generic and server methods. In addition, they provide methods to
972 access the userinfo sub-components: $uri->user and $uri->password.
973
974 =item B<urn>:
975
976 The syntax of Uniform Resource Names is specified in RFC 2141.  C<URI>
977 objects belonging to the urn scheme provide the common methods, and also the
978 methods $uri->nid and $uri->nss, which return the Namespace Identifier
979 and the Namespace-Specific String respectively.
980
981 The Namespace Identifier basically works like the Scheme identifier of
982 URIs, and further divides the URN namespace.  Namespace Identifier
983 assignments are maintained at
984 <http://www.iana.org/assignments/urn-namespaces>.
985
986 Letter case is not significant for the Namespace Identifier.  It is
987 always returned in lower case by the $uri->nid method.  The $uri->_nid
988 method can be used if you want it in its original case.
989
990 =item B<urn>:B<isbn>:
991
992 The C<urn:isbn:> namespace contains International Standard Book
993 Numbers (ISBNs) and is described in RFC 3187.  A C<URI> object belonging
994 to this namespace has the following extra methods (if the
995 Business::ISBN module is available): $uri->isbn,
996 $uri->isbn_publisher_code, $uri->isbn_group_code (formerly isbn_country_code,
997 which is still supported but issues a deprecation warning), $uri->isbn_as_ean.
998
999 =item B<urn>:B<oid>:
1000
1001 The C<urn:oid:> namespace contains Object Identifiers (OIDs) and is
1002 described in RFC 3061.  An object identifier consists of sequences of digits
1003 separated by dots.  A C<URI> object belonging to this namespace has an
1004 additional method called $uri->oid that can be used to get/set the oid
1005 value.  In a list context, oid numbers are returned as separate elements.
1006
1007 =back
1008
1009 =head1 CONFIGURATION VARIABLES
1010
1011 The following configuration variables influence how the class and its
1012 methods behave:
1013
1014 =over 4
1015
1016 =item $URI::ABS_ALLOW_RELATIVE_SCHEME
1017
1018 Some older parsers used to allow the scheme name to be present in the
1019 relative URL if it was the same as the base URL scheme.  RFC 2396 says
1020 that this should be avoided, but you can enable this old behaviour by
1021 setting the $URI::ABS_ALLOW_RELATIVE_SCHEME variable to a TRUE value.
1022 The difference is demonstrated by the following examples:
1023
1024   URI->new("http:foo")->abs("http://host/a/b")
1025       ==>  "http:foo"
1026
1027   local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1;
1028   URI->new("http:foo")->abs("http://host/a/b")
1029       ==>  "http://host/a/foo"
1030
1031
1032 =item $URI::ABS_REMOTE_LEADING_DOTS
1033
1034 You can also have the abs() method ignore excess ".."
1035 segments in the relative URI by setting $URI::ABS_REMOTE_LEADING_DOTS
1036 to a TRUE value.  The difference is demonstrated by the following
1037 examples:
1038
1039   URI->new("../../../foo")->abs("http://host/a/b")
1040       ==> "http://host/../../foo"
1041
1042   local $URI::ABS_REMOTE_LEADING_DOTS = 1;
1043   URI->new("../../../foo")->abs("http://host/a/b")
1044       ==> "http://host/foo"
1045
1046 =item $URI::DEFAULT_QUERY_FORM_DELIMITER
1047
1048 This value can be set to ";" to have the query form C<key=value> pairs
1049 delimited by ";" instead of "&" which is the default.
1050
1051 =back
1052
1053 =head1 BUGS
1054
1055 Using regexp variables like $1 directly as arguments to the URI methods
1056 does not work too well with current perl implementations.  I would argue
1057 that this is actually a bug in perl.  The workaround is to quote
1058 them. Example:
1059
1060    /(...)/ || die;
1061    $u->query("$1");
1062
1063 =head1 PARSING URIs WITH REGEXP
1064
1065 As an alternative to this module, the following (official) regular
1066 expression can be used to decode a URI:
1067
1068   my($scheme, $authority, $path, $query, $fragment) =
1069   $uri =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
1070
1071 The C<URI::Split> module provides the function uri_split() as a
1072 readable alternative.
1073
1074 =head1 SEE ALSO
1075
1076 L<URI::file>, L<URI::WithBase>, L<URI::QueryParam>, L<URI::Escape>,
1077 L<URI::Split>, L<URI::Heuristic>
1078
1079 RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax",
1080 Berners-Lee, Fielding, Masinter, August 1998.
1081
1082 http://www.iana.org/assignments/uri-schemes
1083
1084 http://www.iana.org/assignments/urn-namespaces
1085
1086 http://www.w3.org/Addressing/
1087
1088 =head1 COPYRIGHT
1089
1090 Copyright 1995-2009 Gisle Aas.
1091
1092 Copyright 1995 Martijn Koster.
1093
1094 This program is free software; you can redistribute it and/or modify
1095 it under the same terms as Perl itself.
1096
1097 =head1 AUTHORS / ACKNOWLEDGMENTS
1098
1099 This module is based on the C<URI::URL> module, which in turn was
1100 (distantly) based on the C<wwwurl.pl> code in the libwww-perl for
1101 perl4 developed by Roy Fielding, as part of the Arcadia project at the
1102 University of California, Irvine, with contributions from Brooks
1103 Cutter.
1104
1105 C<URI::URL> was developed by Gisle Aas, Tim Bunce, Roy Fielding and
1106 Martijn Koster with input from other people on the libwww-perl mailing
1107 list.
1108
1109 C<URI> and related subclasses were developed by Gisle Aas.
1110
1111 =cut
Note: See TracBrowser for help on using the browser.