Allegato "newsletter-ml-py2.py"
Scarica 1 #!/usr/bin/python
2 # -*- coding: UTF-8 -*-
3
4 # Copyright (C) 2007 Milo Casagrande <milo@ubuntu.com>
5 # Copyright (C) 2011 Milo Casagrande <milo@ubuntu.com>
6 #
7 # This program is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by the Free
9 # Software Foundation; either version 2 of the License, or (at your option)
10 # any later version
11 #
12 # This program is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 # more details
16 #
17 # You should have received a copy of the GNU General Public License along with
18 # this program; if not, write to the Free Software Foundation, Inc.,
19 # 51 Franklin Street, Fifth Floor, Boston, MA, 02110-1301 USA.
20
21 '''
22 Simple script to convert the Italian newsletter into pure text
23 cleaning all the MoinMoin wiki syntax, in order to send the newsletter
24 via email.
25 '''
26
27 import sys
28 import re
29 import xmlrpclib
30 import codecs
31 import string
32 import os
33
# Minimum interpreter version this script supports, as (major, minor).
req_version = (2,6)
cur_version = sys.version_info

# The script imports Python 2 only modules (xmlrpclib), so any other major
# version is refused, and so is a 2.x interpreter older than the required
# minor version. The minor number is compared against req_version: comparing
# it with itself, as was done before, can never be true.
if (cur_version[0] != req_version[0] or
        cur_version[1] < req_version[1]):
    print(u"You need to have Python v.2.6")
    sys.exit(2)
else:
    from optparse import OptionParser
43
# Characters treated as letters when validating interactive input.
# string.ascii_letters is the locale-independent constant; string.letters
# depends on the active locale and does not exist on newer interpreters.
alpha = string.ascii_letters

# Version reported by the --version command line switch
prog_ver = "0.0.4"

# URL of the Italian wiki
wiki_base = "http://wiki.ubuntu-it.org/"

# URL of the International wiki
wiki_world = "http://wiki.ubuntu.com/"

# Base string for the newsletter
new_base = "NewsletterItaliana/"

# MoinMoin line break macro followed by the real end of line
line_break = "<<BR>>\n"

# Name of the wiki category; lines starting with it are dropped
wiki_category = "CategoryComunita"

# What str(type({})) yields under Python 2
dict_type = "<type 'dict'>"

# Default name for the output file
default_output = "newsletter-output.txt"

# Wiki comment marker, stripped from the line kept for the email version
comment = "##"

# The year of the newsletter
year = ""

# The number of the newsletter
number = ""
72
73 # XML-RPC after the upgrade to version 1.9 of the Italian wiki
74 # is not working anymore. Keep the function in case we have time
75 # to enable it again
def get_newsletter():
    """
    Read the newsletter wiki text directly online

    Needs XML-RPC enabled on the wiki. The page name is built from the
    module-level new_base, year and number values.

    Exits with status 2 when the wiki answers with a dict instead of the
    page text.
    """
    wiki = xmlrpclib.ServerProxy(wiki_base + "?action=xmlrpc2")

    pagedata = wiki.getPage(new_base + year + "." + number)

    # isinstance() replaces the comparison of str(type(...)) with the
    # literal "<type 'dict'>": that text changes between interpreter
    # versions and never matches dict subclasses.
    if isinstance(pagedata, dict):
        print("*** Error: page does not exist.")
        sys.exit(2)

    # NOTE(review): read_newsletter() reads the inputfile and outputfile
    # attributes of its argument, while pagedata is whatever getPage()
    # returned. This call looks stale and has to be reworked before the
    # function is enabled again.
    read_newsletter(pagedata)
97
def read_newsletter(options):
    """
    Open the input file, create the output file and do the parsing

    @options: object exposing the inputfile and outputfile attributes, as
              returned by define_optionparser()

    Every input line is passed to check_string(); lines for which it
    returns None are dropped, the others are written to the output file.
    Exits with status 2 if either file cannot be opened.
    """
    inputfile = options.inputfile

    # A name chosen by the user is resolved against the current directory,
    # the default name is placed in the user's home directory.
    if options.outputfile != default_output:
        outputfile = os.path.abspath(options.outputfile)
    else:
        outputfile = os.path.expanduser("~" + os.sep + options.outputfile)

    try:
        infile = open(inputfile, 'r')
    except IOError:
        print("*** Error opening input file %s" % inputfile)
        sys.exit(2)

    # The with blocks close both files on every exit path, including the
    # sys.exit() below, which used to leave the input file open.
    with infile:
        try:
            outfile = open(outputfile, 'w')
        except IOError:
            print("*** Error opening output file %s" % outputfile)
            sys.exit(2)

        with outfile:
            print("Reading newsletter text from %s..." % inputfile)
            cleaned = [check_string(line) for line in infile]

            print("Writing output file...")
            outfile.write("".join(piece for piece in cleaned
                                  if piece is not None))

    print("Newsletter created in %s." % outputfile)
146
# Markup that makes check_string() discard the whole line, tested before the
# e-mail note is looked for: ACL, format and language directives, table rows
# and horizontal rules.
_DROP_LINE_FIRST = (
    r'\#acl',
    r'\#format',
    r'\#LANGUAGE',
    r'\|\|\<table',
    r'\-{4,6}',
)

# Markup that discards the line when the e-mail note is absent: commented
# lines, attachments, image macros and the index macro.
_DROP_LINE_SECOND = (
    r'^#{2,2}',
    r'attachment',
    r'<<Immagine\(.*?>>',
    r'<<Indice\(?.*?>>',
)

# [[http://...|label]] and [[https://...|label]]
_HTTP_LINK = r'\[{2,2}http[s]*\:/{2,2}[|:*\w\S]+\s*\|\s*[\#*\(*\)*\:*,*\{*\}*+*\w\s\d.-]+\]{2,2}'

# [[PageName|label]] pointing to the Italian wiki
_WIKI_LINK = r'\[{2,2}(?!http[s]*\:/{2,2})(?!Ubuntu\:)[\w\S\d]+\s*\|\s*[,*\{*\}*+*\w\s\d.-]+\]{2,2}'

# [[Ubuntu:PageName|label]] pointing to the international wiki
_WORLD_LINK = r'\[{2,2}(?!http[s]*:/{2,2})Ubuntu\:[\w\S\d]+\s*\|[,*\{*\}*+*\w\s\d.-]+\]{2,2}'


def _rewrite_links(text, pattern, make_url):
    """Replace every link matching pattern with 'label ( url )'."""
    found = re.findall(pattern, text)

    for original, bare in zip(found, replace_square(found)):
        pieces = bare.split("|")
        # Everything after the first "|" is the label; each piece keeps
        # the trailing blank the plain text layout relies on.
        label = "".join(piece + " " for piece in pieces[1:])
        text = text.replace(original,
                            label + "( " + make_url(pieces[0]) + " )")

    return text


def check_string(string):
    """Check the string and return it cleaned from all
    the wiki syntax

    @string: the string to analyze

    Returns None when the line has to be left out of the newsletter,
    otherwise the cleaned text. Title lines are returned unchanged apart
    from a newline added in front of them.
    """
    text = string

    if any(re.search(pattern, text) for pattern in _DROP_LINE_FIRST):
        return None

    if re.search('##Per la versione in linea', text):
        # Line for e-mail version is kept, without the comment marker
        text = text.replace(comment, "")
    elif any(re.search(pattern, text) for pattern in _DROP_LINE_SECOND):
        return None
    elif re.search(r'\={1,3}\s.*?\s\={1,3}', text):
        # Titles are kept
        return "\n" + text
    elif re.search(r'<<BR>>\n', text):
        # Each break/newline is substituted with the real newline
        text = text.replace(line_break, "\n")
    elif re.match(wiki_category, text):
        # Remove the category
        return None

    # Remove the quotes used for emphasis, they have to be at least two
    for marked in re.findall(r"'{2,5}.*?'{2,5}", text):
        text = text.replace(marked, marked.replace("'", ""))

    # Remove all back-quotes
    text = text.replace("`", "")

    # A line made only of the newline character becomes an empty string
    if re.search(r'^$\n', text):
        text = text.replace("\n", "")

    # Remove the exclamation marks that follow a whitespace (the wiki uses
    # them to stop automatic linking); the ones ending a sentence are kept.
    text = re.sub(r'(\s)\!', r'\1', text)

    # Look for all the http links
    text = _rewrite_links(text, _HTTP_LINK,
                          lambda target: target.strip())

    # Look for the wiki links
    text = _rewrite_links(text, _WIKI_LINK,
                          lambda target: wiki_base + target.strip())

    # Link to the international wiki: drop the "Ubuntu:" prefix, splitting
    # only once so a page name containing ":" is not cut short.
    text = _rewrite_links(
        text, _WORLD_LINK,
        lambda target: wiki_world + target.split(":", 1)[1].strip())

    return text
309
def replace_square(exp):
    """
    Strip the double square brackets from the strings

    @exp: the list with the strings to clean

    Returns a new list, in the same order, where every "[[" and "]]"
    sequence has been removed from each string.
    """
    return [item.replace("[[", "").replace("]]", "") for item in exp]
325
def get_newsletter_number():
    """
    Ask interactively for the newsletter year and number

    Stores the answers in the module-level year and number variables.
    The year has to be made of four digits; the number has to be made of
    one to three digits and is padded on the left with zeros up to three
    characters. Each question is repeated until the answer is valid.
    """
    global year, number

    while True:
        answer = raw_input("Please insert the newsletter year: ")

        if answer and not answer.isdigit():
            print("Bad data!")
        elif len(answer) != 4:
            print("The year you typed is wrong.")
        else:
            year = answer
            break

    while True:
        answer = raw_input("Please insert the newsletter number: ")

        if answer and not answer.isdigit():
            print("Bad data!")
        elif not 1 <= len(answer) <= 3:
            # Covers the empty answer as well as numbers that are too long
            print("Invalid newsletter number.")
        else:
            # zfill() also accepts a number that already has three digits,
            # a case that previously made the question repeat forever.
            number = answer.zfill(3)
            break
364
def define_optionparser():
    """
    Build the command line parser and parse the program arguments

    Returns the options object produced by optparse; its inputfile and
    outputfile attributes hold the values of -i/--input and -o/--output.
    Terminates through parser.error() when no input file was given.
    """
    usage = "Usage: %prog [option] arg..."
    version = "%prog " + prog_ver

    parser = OptionParser(usage=usage, version=version)

    parser.add_option("-i", "--input", metavar="FILE", action="store",
                      type="string", dest="inputfile",
                      help="the input file to read")
    # %default is expanded by optparse, so the help text always shows the
    # real default instead of a hand-copied file name.
    parser.add_option("-o", "--output", metavar="FILE", action="store",
                      type="string", dest="outputfile",
                      default=default_output,
                      help="the name of the output file; default value is "
                           "'%default' and will be written in the user dir")

    (options, _args) = parser.parse_args()

    # Without any argument inputfile is None as well, so a single check is
    # enough. parser.error() prints the message and exits with status 2.
    if options.inputfile is None:
        parser.error("you need to specify the input file.")

    return options
384
def main():
    """
    Parse the command line and convert the newsletter
    """
    # The interactive step stays disabled, since the newsletter cannot be
    # downloaded directly anymore:
    # get_newsletter_number()
    read_newsletter(define_optionparser())
392
# Run the conversion only when the file is executed as a script, then
# leave with a success status.
if __name__ == "__main__":
    main()
    sys.exit(0)
Allegati
Per riferirsi agli allegati di una pagina, usare attachment:NOME_FILE, come mostrato qui sotto nell'elenco degli allegati. NON usare l'URL che si trova in corrispondenza del collegamento [scarica], potrebbe cambiare in futuro. Non è consentito inserire allegati su questa pagina.