Merge "Add back lost python script."
This commit is contained in:
committed by
Android (Google) Code Review
commit
ea23206c28
160
common/tools/make-iana-tld-pattern.py
Executable file
160
common/tools/make-iana-tld-pattern.py
Executable file
@@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3
|
||||
|
||||
# Java source text printed before the generated top-level-domain alternation.
# It is a raw string, so every character below (including the line breaks) is
# emitted verbatim into the generated Java.
# NOTE(review): any leading indentation the emitted Java lines once had is not
# visible here; re-indent if the output is pasted inside a class body.
TLD_PREFIX = r"""
/**
* Regular expression pattern to match all IANA top-level domains.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
"""

# Closes the Java string literal and the Pattern.compile( call opened above.
TLD_SUFFIX = '");'
|
||||
|
||||
# Java source text printed before the generated top-level-domain alternation
# of the WEB_URL pattern: the Javadoc, the declaration, and the regex pieces
# for scheme, optional user:password@ and the named-host part.  It is a raw
# string, so the doubled backslashes reach the Java source unchanged.
# NOTE(review): any leading indentation the emitted Java lines once had is not
# visible here; re-indent if the output is pasted inside a class body.
URL_PREFIX = r"""
/**
* Regular expression pattern to match RFC 1738 URLs
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
public static final Pattern WEB_URL = Pattern.compile(
"((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
+ "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
+ "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
+ "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
+ "(?:" // plus top level domain
"""
|
||||
|
||||
# Java source text printed after the generated top-level-domain alternation
# of the WEB_URL pattern: the dotted-quad IP address alternative, the optional
# port, the optional path/query part and the trailing word-boundary guard,
# ending with the ");" that closes Pattern.compile(.  It is a raw string, so
# the doubled backslashes reach the Java source unchanged.
# NOTE(review): any leading indentation the emitted Java lines once had is not
# visible here; re-indent if the output is pasted inside a class body.
URL_SUFFIX = r"""
+ "|(?:(?:25[0-5]|2[0-4]" // or ip address
+ "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
+ "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
+ "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
+ "|[1-9][0-9]|[0-9])))"
+ "(?:\\:\\d{1,5})?)" // plus option port number
+ "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params
+ "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
+ "(?:\\b|$)"); // and finally, a word boundary or end of
// input. This is to stop foo.sure from
// matching as foo.su
"""
|
||||
|
||||
class Bucket:
    """Domain names sharing one first letter, rendered as one Java regex line.

    Two-character entries are kept as their second letter only (in
    ``letters``); every other entry is kept whole (in ``words``).  ``dump``
    turns the bucket into a single line of Java string-literal source holding
    the regex alternation for this letter.
    """

    def __init__(self, baseLetter):
        """Create an empty bucket for entries starting with ``baseLetter``."""
        self.base = baseLetter  # first letter shared by every entry
        self.words = []         # entries whose length is not 2, stored whole
        self.letters = []       # second letters of the two-character entries

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return this bucket as one line of Java source, or '' when empty.

        isWebUrl -- group the words with a non-capturing ``(?:`` instead of
                    ``(``, and open the first line with ``+ "`` rather than
                    ``"(``.
        isFirst  -- this is the first emitted line, so it is not introduced
                    by ``+ "|``.
        isLast   -- leave the string literal open and emit no newline, so the
                    caller can append the closing parentheses on this line.

        Sorts ``self.words`` and ``self.letters`` in place as a side effect.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        parts = [' ']

        if not isFirst:
            parts.append('+ "|')
        elif isWebUrl:
            parts.append('+ "')
        else:
            parts.append('"(')

        if self.words:
            parts.append('(?:' if isWebUrl else '(')
            # A '-' is emitted as two backslashes plus '-', which is the Java
            # string-literal spelling of the regex escape for '-'.
            parts.append('|'.join(word.replace('-', '\\\\-')
                                  for word in self.words))
            if self.letters:
                parts.append('|')

        if len(self.letters) == 1:
            parts.append('%c%c' % (self.base, self.letters[0]))
        elif self.letters:
            # Several two-letter entries collapse into a character class.
            parts.append('%c[' % self.base)
            parts.append(''.join(self.letters))
            parts.append(']')

        if self.words:
            parts.append(')')

        if not isLast:
            # The last line stays open: no closing quote and no newline.
            parts.append('"\n')

        return ''.join(parts)

    def add(self, line):
        """File ``line`` into the bucket; '#' comments and '' are ignored.

        The caller is expected to pass the entry already stripped of
        surrounding whitespace, since the length decides where it is stored.
        """
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1:2])
        else:
            self.words.append(line)
|
||||
|
||||
def getBucket(buckets, line):
    """Return the Bucket for the first character of ``line``, creating it if needed.

    buckets -- dict mapping a first character to its Bucket; a new Bucket is
               stored in it when the character has no entry yet (or maps to
               None).
    line    -- non-empty string; only ``line[0]`` is used.  An empty string
               raises IndexError.
    """
    letter = line[0]
    bucket = buckets.get(letter)

    if bucket is None:
        bucket = buckets[letter] = Bucket(letter)

    return bucket
|
||||
|
||||
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print one complete Java pattern declaration built from ``buckets``.

    The printed text is ``prefix``, then one ``dump()`` line per non-empty
    bucket for the letters 'a' through 'z' in order, then the closing
    parentheses (``))"`` for the web-URL pattern, ``)`` otherwise), then
    ``suffix``.

    prefix, suffix -- Java source text placed around the generated lines.
    buckets        -- dict mapping a letter to an object with ``words``,
                      ``letters`` and ``dump(isWebUrl=, isFirst=, isLast=)``;
                      it is only read, never modified.
    isWebUrl       -- forwarded to every ``dump()`` call and selects the
                      closing text.

    Returns None; the result is written to standard output.
    """
    # Only buckets that will actually emit a line take part, so the "first"
    # and "last" markers always land on lines that exist.  Giving them to a
    # fixed 'a' / 'z' would drop the opening or closing of the literal
    # whenever one of those letters has no entries.
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = buckets.get(chr(code))
        if bucket is not None and (bucket.words or bucket.letters):
            populated.append(bucket)

    pieces = [prefix]
    lastIndex = len(populated) - 1

    for index, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(index == 0),
                                  isLast=(index == lastIndex)))

    pieces.append('))"' if isWebUrl else ')')
    pieces.append(suffix)

    # Parenthesised single argument: valid as both the Python 2 print
    # statement and the Python 3 print function.
    print(''.join(pieces))
|
||||
|
||||
if __name__ == "__main__":
    # Download the IANA list: one entry per line, with '#' comment lines.
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        # Release the connection even when the read fails.
        f.close()

    buckets = {}

    for domain in domains:
        # The response may yield bytes rather than text; normalise to str so
        # indexing and string formatting behave the same either way.
        if not isinstance(domain, str):
            domain = domain.decode('ascii')

        # Strip before choosing the bucket so surrounding whitespace can
        # neither pick the bucket nor change the entry's length.
        domain = domain.strip().lower()

        if not domain or domain.startswith('#'):
            continue

        getBucket(buckets, domain[0]).add(domain)

    # Emit both Java declarations, TOP_LEVEL_DOMAIN first, then WEB_URL.
    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
|
||||
Reference in New Issue
Block a user