From 57c5b0a980c123f93082b168eff8ceab04085d2f Mon Sep 17 00:00:00 2001 From: Nobuyoshi Nakada Date: Thu, 13 Oct 2022 18:01:58 +0900 Subject: [ruby/uri] Refactor RFC3986 regexps to make more readable https://github.com/ruby/uri/commit/3dfa19e920 --- lib/uri/rfc3986_parser.rb | 82 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 11 deletions(-) (limited to 'lib/uri/rfc3986_parser.rb') diff --git a/lib/uri/rfc3986_parser.rb b/lib/uri/rfc3986_parser.rb index dd24a409ea..8cc51f1286 100644 --- a/lib/uri/rfc3986_parser.rb +++ b/lib/uri/rfc3986_parser.rb @@ -1,9 +1,69 @@ -# frozen_string_literal: false +# frozen_string_literal: true module URI class RFC3986_Parser # :nodoc: # URI defined in RFC3986 - RFC3986_URI = /\A(?(?[A-Za-z][+\-.0-9A-Za-z]*+):(?\/\/(?(?:(?(?:%\h\h|[!$&-.0-;=A-Z_a-z~])*+)@)?(?(?\[(?:(?(?:\h{1,4}:){6}(?\h{1,4}:\h{1,4}|(?(?[1-9]\d|1\d{2}|2[0-4]\d|25[0-5]|\d)\.\g\.\g\.\g))|::(?:\h{1,4}:){5}\g|\h{1,4}?::(?:\h{1,4}:){4}\g|(?:(?:\h{1,4}:)?\h{1,4})?::(?:\h{1,4}:){3}\g|(?:(?:\h{1,4}:){,2}\h{1,4})?::(?:\h{1,4}:){2}\g|(?:(?:\h{1,4}:){,3}\h{1,4})?::\h{1,4}:\g|(?:(?:\h{1,4}:){,4}\h{1,4})?::\g|(?:(?:\h{1,4}:){,5}\h{1,4})?::\h{1,4}|(?:(?:\h{1,4}:){,6}\h{1,4})?::)|(?v\h++\.[!$&-.0-;=A-Z_a-z~]++))\])|\g|(?(?:%\h\h|[!$&-.0-9;=A-Z_a-z~])*+))(?::(?\d*+))?)(?(?:\/(?(?:%\h\h|[!$&-.0-;=@-Z_a-z~])*+))*+)|(?\/(?:(?(?:%\h\h|[!$&-.0-;=@-Z_a-z~])++)(?:\/\g)*+)?)|(?\g(?:\/\g)*+)|(?))(?:\?(?[^#]*+))?(?:\#(?(?:%\h\h|[!$&-.0-;=@-Z_a-z~\/?])*+))?)\z/ - RFC3986_relative_ref = /\A(?(?\/\/(?(?:(?(?:%\h\h|[!$&-.0-;=A-Z_a-z~])*+)@)?(?(?\[(?:(?(?:\h{1,4}:){6}(?\h{1,4}:\h{1,4}|(?(?[1-9]\d|1\d{2}|2[0-4]\d|25[0-5]|\d)\.\g\.\g\.\g))|::(?:\h{1,4}:){5}\g|\h{1,4}?::(?:\h{1,4}:){4}\g|(?:(?:\h{1,4}:){,1}\h{1,4})?::(?:\h{1,4}:){3}\g|(?:(?:\h{1,4}:){,2}\h{1,4})?::(?:\h{1,4}:){2}\g|(?:(?:\h{1,4}:){,3}\h{1,4})?::\h{1,4}:\g|(?:(?:\h{1,4}:){,4}\h{1,4})?::\g|(?:(?:\h{1,4}:){,5}\h{1,4})?::\h{1,4}|(?:(?:\h{1,4}:){,6}\h{1,4})?::)|(?v\h++\.[!$&-.0-;=A-Z_a-z~]++))\])|\g|(?(?:%\h\h|[!$&-.0-9;=A-Z_a-z~])++))?(?::(?\d*+))?)(?(?:\/(?(?:%\h\h|[!$&-.0-;=@-Z_a-z~])*+))*+)|(?\/(?:(?(?:%\h\h|[!$&-.0-;=@-Z_a-z~])++)(?:\/\g)*+)?)|(?(?(?:%\h\h|[!$&-.0-9;=@-Z_a-z~])++)(?:\/\g)*+)|(?))(?:\?(?[^#]*+))?(?:\#(?(?:%\h\h|[!$&-.0-;=@-Z_a-z~\/?])*+))?)\z/ + HOST = %r[ + (?\[(?: + (? + (?:\h{1,4}:){6} + (?\h{1,4}:\h{1,4} + | (?(?[1-9]\d|1\d{2}|2[0-4]\d|25[0-5]|\d) + \.\g\.\g\.\g) + ) + | ::(?:\h{1,4}:){5}\g + | \h{1,4}?::(?:\h{1,4}:){4}\g + | (?:(?:\h{1,4}:)?\h{1,4})?::(?:\h{1,4}:){3}\g + | (?:(?:\h{1,4}:){,2}\h{1,4})?::(?:\h{1,4}:){2}\g + | (?:(?:\h{1,4}:){,3}\h{1,4})?::\h{1,4}:\g + | (?:(?:\h{1,4}:){,4}\h{1,4})?::\g + | (?:(?:\h{1,4}:){,5}\h{1,4})?::\h{1,4} + | (?:(?:\h{1,4}:){,6}\h{1,4})?:: + ) + | (?v\h++\.[!$&-.0-9:;=A-Z_a-z~]++) + )\]) + | \g + | (?(?:%\h\h|[!$&-.0-9;=A-Z_a-z~])*+) + ]x + + USERINFO = /(?:%\h\h|[!$&-.0-9:;=A-Z_a-z~])*+/ + AUTHORITY = %r[ + (?:(?#{USERINFO.source})@)? + (?#{HOST.source.delete(" \n")}) + (?::(?\d*+))? + ]x + + SCHEME = %r[[A-Za-z][+\-.0-9A-Za-z]*+].source + SEG = %r[(?:%\h\h|[!$&-.0-9:;=@A-Z_a-z~/])].source + FRAGMENT = %r[(?:%\h\h|[!$&-.0-9:;=@A-Z_a-z~/?])*+].source + + RFC3986_URI = %r[\A + (?#{SEG}){0} + (? + (?#{SCHEME}): + (?// + (?#{AUTHORITY}) + (?(?:/\g*+)?) + | (?/\g*+) + | (?(?!=/)\g++) + | (?) + ) + (?:\?(?[^\#]*+))? + (?:\#(?#{FRAGMENT}))? + )\z]x + + RFC3986_relative_ref = %r[\A + (?#{SEG}){0} + (? + (?// + (?#{AUTHORITY}) + (?(?:/\g*+)?) + | (?/\g*+) + | (?(?!=[:/])\g++) + | (?) + ) + (?:\?(?[^#]*+))? + (?:\#(?#{FRAGMENT}))? + )\z]x attr_reader :regexp def initialize @@ -92,14 +152,14 @@ module URI def default_regexp # :nodoc: { - SCHEME: /\A[A-Za-z][A-Za-z0-9+\-.]*\z/, - USERINFO: /\A(?:%\h\h|[!$&-.0-;=A-Z_a-z~])*\z/, - HOST: /\A(?:(?\[(?:(?(?:\h{1,4}:){6}(?\h{1,4}:\h{1,4}|(?(?[1-9]\d|1\d{2}|2[0-4]\d|25[0-5]|\d)\.\g\.\g\.\g))|::(?:\h{1,4}:){5}\g|\h{,4}::(?:\h{1,4}:){4}\g|(?:(?:\h{1,4}:)?\h{1,4})?::(?:\h{1,4}:){3}\g|(?:(?:\h{1,4}:){,2}\h{1,4})?::(?:\h{1,4}:){2}\g|(?:(?:\h{1,4}:){,3}\h{1,4})?::\h{1,4}:\g|(?:(?:\h{1,4}:){,4}\h{1,4})?::\g|(?:(?:\h{1,4}:){,5}\h{1,4})?::\h{1,4}|(?:(?:\h{1,4}:){,6}\h{1,4})?::)|(?v\h+\.[!$&-.0-;=A-Z_a-z~]+))\])|\g|(?(?:%\h\h|[!$&-.0-9;=A-Z_a-z~])*))\z/, - ABS_PATH: /\A\/(?:%\h\h|[!$&-.0-;=@-Z_a-z~])*(?:\/(?:%\h\h|[!$&-.0-;=@-Z_a-z~])*)*\z/, - REL_PATH: /\A(?:%\h\h|[!$&-.0-;=@-Z_a-z~])+(?:\/(?:%\h\h|[!$&-.0-;=@-Z_a-z~])*)*\z/, - QUERY: /\A(?:%\h\h|[!$&-.0-;=@-Z_a-z~\/?])*\z/, - FRAGMENT: /\A(?:%\h\h|[!$&-.0-;=@-Z_a-z~\/?])*\z/, - OPAQUE: /\A(?:[^\/].*)?\z/, + SCHEME: %r[\A#{SCHEME}\z]o, + USERINFO: %r[\A#{USERINFO}\z]o, + HOST: %r[\A#{HOST}\z]o, + ABS_PATH: %r[\A/#{SEG}*+\z]o, + REL_PATH: %r[\A(?!=/)#{SEG}++\z]o, + QUERY: %r[\A(?:%\h\h|[!$&-.0-9:;=@A-Z_a-z~/?])*+\z], + FRAGMENT: %r[\A#{FRAGMENT}\z]o, + OPAQUE: %r[\A(?:[^/].*)?\z], PORT: /\A[\x09\x0a\x0c\x0d ]*\d*[\x09\x0a\x0c\x0d ]*\z/, } end -- cgit v1.2.3