From c7cc9ebe206d8d418462c08d10893cd4186538f5 Mon Sep 17 00:00:00 2001 From: Werner Lemberg Date: Tue, 25 Jun 2013 07:24:02 +0200 Subject: [PATCH] [docmaker] Recognise URLs. * src/tools/docmaker/tohtml.py (re_url): New regular expression. (make_html_para): Use it. --- ChangeLog | 7 ++++++ src/tools/docmaker/tohtml.py | 41 ++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 5cda57a46..7425941cc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2013-06-25 Werner Lemberg + + [docmaker] Recognise URLs. + + * src/tools/docmaker/tohtml.py (re_url): New regular expression. + (make_html_para): Use it. + 2013-06-19 Werner Lemberg * Version 2.5.0.1 released. diff --git a/src/tools/docmaker/tohtml.py b/src/tools/docmaker/tohtml.py index 1cbda755b..2ff44d69c 100644 --- a/src/tools/docmaker/tohtml.py +++ b/src/tools/docmaker/tohtml.py @@ -1,11 +1,46 @@ -# ToHTML (c) 2002, 2003, 2005, 2006, 2007, 2008 +# ToHTML (c) 2002, 2003, 2005-2008, 2013 # David Turner from sources import * from content import * from formatter import * -import time +import time, re + + +# this regular expression code to identify an URL has been taken from +# +# http://mail.python.org/pipermail/tutor/2002-September/017228.html +# +# (with slight modifications) + +urls = r'(?:https?|telnet|gopher|file|wais|ftp)' +ltrs = r'\w' +gunk = r'/#~:.?+=&%@!\-' +punc = r'.:?\-' +any = "%(ltrs)s%(gunk)s%(punc)s" % { 'ltrs' : ltrs, + 'gunk' : gunk, + 'punc' : punc } +url = r""" + ( + \b # start at word boundary + %(urls)s : # need resource and a colon + [%(any)s] +? # followed by one or more of any valid + # character, but be conservative and + # take only what you need to... 
+ (?= # [look-ahead non-consumptive assertion] + [%(punc)s]* # either 0 or more punctuation + (?: # [non-grouping parentheses] + [^%(any)s] | $ # followed by a non-url char + # or end of the string + ) + ) + ) + """ % {'urls' : urls, + 'any' : any, + 'punc' : punc } + +re_url = re.compile( url, re.VERBOSE | re.MULTILINE ) # The following defines the HTML header used by all generated pages. @@ -291,6 +326,8 @@ class HtmlFormatter( Formatter ): line = self.make_html_word( words[0] ) for word in words[1:]: line = line + " " + self.make_html_word( word ) + # handle hyperlinks + line = re_url.sub( r'<a href="\1">\1</a>', line ) # convert `...' quotations into real left and right single quotes line = re.sub( r"(^|\W)`(.*?)'(\W|$)", \ r'\1‘\2’\3', \