#!/usr/bin/python
# -*- coding: utf-8 -*-

# =============================================================================
#  Version: 1.1 (Mar 31, 2009)
#  Author: Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa
# =============================================================================

# =============================================================================
#  This program is free software; you can redistribute it and/or modify it
#  under the terms of the GNU General Public License, version 3,
#  as published by the Free Software Foundation.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
# =============================================================================

"""Wikipedia Extractor:
Extracts and cleans text from Wikipedia database dump and stores each
article in file named as article title in AURORA code.
Taken out from Tanl software package.

Usage:
  WikiExtractor.py [options]

Options:
  -o ..., --output=...  : place output files in specified directory
                          (current directory by default)
  --help                : display this help and exit
  --usage               : display script usage
"""

import sys
import getopt
import io
import re
import os.path

try:  # Python 2
    from htmlentitydefs import name2codepoint
except ImportError:  # Python 3
    from html.entities import name2codepoint

try:
    unichr
except NameError:  # Python 3 has no unichr(); chr() plays the same role
    unichr = chr


### SUPPORT CLASSES ###########################################################

class WikiDocument:
    """Mutable record describing one article (id, url, title, text)."""

    def __init__(self):
        self.id = None
        self.url = None
        self.text = None
        self.title = None

    def __str__(self):
        return '%s\n' % (self.text)

#------------------------------------------------------------------------------

class WikiExtractor:
    """Turns the raw wiki markup of a WikiDocument into plain text.

    The only public entry point is extract(); everything else is a private
    helper or a private table of tag/namespace names.
    """

    # Tags removed together with everything they enclose.
    __garbage_tags = ('ref', 'gallery', 'timeline', 'noinclude', 'pre',
                      'table', 'tr', 'td', 'ul', 'li', 'ol', 'dl', 'dt',
                      'dd', 'menu', 'dir')

    # Tags removed while their content is kept.
    __wrapper_tags = ('nowiki', 'cite', 'source', 'hiero', 'div', 'font',
                      'span', 'strong', 'strike', 'blockquote', 'tt', 'var',
                      'sup', 'sub', 'big', 'small', 'center', 'h1', 'h2',
                      'h3', 'em', 'b', 'i', 'u', 'a', 's', 'p')

    # Stand-alone tags (self-closed or stray) that are simply dropped.
    __single_tags = ('references', 'ref', 'img', 'br', 'hr', 'li', 'dt', 'dd')

    # Tags whose whole element is replaced by a numbered "[Label n]" marker.
    __placeholder_tags = {'math': 'Formula', 'code': 'Codice'}

    __project_namespaces = ('wikipedia', 'mediawiki', 'wikiquote',
                            'wikibooks', 'wikisource', 'wiktionary',
                            'wikispecies', 'wikinews', 'wikiversita',
                            'commons')

    __garbage_namespaces = ('immagine', 'image', 'categoria', 'category')

    # '&name;' -> character, built from the standard HTML 4 entity table.
    # 'amp' is left out on purpose: '&amp;' is decoded by its own explicit
    # replace() in __clean, *before* this table is applied.
    # NOTE(review): the literal table in the received source had its keys
    # entity-decoded (every key equalled its value); it listed the HTML 4
    # named entities, which is what name2codepoint provides - confirm.
    __char_entities = dict(('&%s;' % name, unichr(code))
                           for name, code in name2codepoint.items()
                           if name != 'amp')

    # Candidate named entities; unknown names are left untouched.
    __char_entity_pattern = re.compile(r'&[A-Za-z][A-Za-z0-9]*;')

    def __init__(self):
        """Compile every regular expression used by the cleaning pass."""
        cls = self.__class__
        tag_flags = re.DOTALL | re.IGNORECASE

        # HTML comments
        # NOTE(review): the pattern arrived as an empty string; restored to
        # the standard HTML comment form - confirm.
        self.__comment_pattern = re.compile(r'<!--.*?-->', re.DOTALL)

        # Garbage tags (element and content are discarded)
        self.__garbage_tag_patterns = list()
        for tag in cls.__garbage_tags:
            pattern = re.compile(
                r'<\s*%s(\s*| [^/]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag),
                tag_flags)
            self.__garbage_tag_patterns.append(pattern)

        # Wrapper tags (only the opening/closing tags are discarded)
        self.__wrapper_tag_patterns = list()
        for tag in cls.__wrapper_tags:
            left_pattern = re.compile(r'<\s*%s(\s*| [^/]+?)>' % tag,
                                      tag_flags)
            right_pattern = re.compile(r'<\s*/\s*%s\s*>' % tag, tag_flags)
            self.__wrapper_tag_patterns.append((left_pattern, right_pattern))

        # Single tags: well-formed ("<br />") and malformed variants
        self.__single_tag_patterns = list()
        for tag in cls.__single_tags:
            good_pattern = re.compile(r'<\s*%s(\s*| .+?)/\s*>' % tag,
                                      tag_flags)
            bad_pattern = re.compile(
                r'<\s*(/|\\)?\s*%s(\s*| [^/]+?)\\?\s*>' % tag, tag_flags)
            self.__single_tag_patterns.append((good_pattern, bad_pattern))

        # Placeholder tags
        self.__placeholder_tag_patterns = list()
        for tag, placeholder in cls.__placeholder_tags.items():
            pattern = re.compile(
                r'<\s*%s(\s*| [^/]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag),
                tag_flags)
            self.__placeholder_tag_patterns.append((pattern, placeholder))

        # Tables and templates (after both were normalised to "{...}")
        self.__table_pattern = re.compile(r'\{[^{]*?\}', re.DOTALL)

        # Wikilinks: well-formed, missing a "[" and missing a "]"
        good_wikilink_pattern = re.compile(r'\[\[[^[]*?\]\]', re.DOTALL)
        bad_left_wikilink_pattern = re.compile(r'\[[^[]*?\]\]', re.DOTALL)
        bad_right_wikilink_pattern = re.compile(r'\[\[[^[]*?\]', re.DOTALL)
        self.__wikilink_pattern = (good_wikilink_pattern,
                                   bad_left_wikilink_pattern,
                                   bad_right_wikilink_pattern)

        # HTTP links
        self.__http_link_pattern = re.compile(r'\[http.*?\]', tag_flags)

        # Apostrophes that immediately precede bold / italic markup
        apostrophe_bold_pattern = re.compile(r"\w'('''.*?''')", re.DOTALL)
        apostrophe_italic_pattern = re.compile(r"\w'(''.*?'')", re.DOTALL)
        self.__apostrophe_pattern = (apostrophe_bold_pattern,
                                     apostrophe_italic_pattern)

        # Numeric character references
        self.__numeric_entity_pattern = re.compile(r'&#\d+?;')

        # Runs of spaces
        self.__multi_space_pattern = re.compile(r' {2,}')

        # Runs of four or more dots
        self.__multi_dot_pattern = re.compile(r'\.{4,}')

    def extract(self, wiki_document):
        """Clean and compact wiki_document.text in place.

        Returns the same document, or None when nothing worth keeping is
        left after compaction.
        """
        wiki_document = self.__clean(wiki_document)
        if not wiki_document:
            return None
        wiki_document = self.__compact(wiki_document)
        return wiki_document

    def __clean(self, wiki_document):
        """Strip markup from wiki_document.text and return the document."""
        text = wiki_document.text

        # Make tags recognisable: the text carries them XML-escaped.
        # NOTE(review): '&lt;' / '&gt;' were entity-decoded in the received
        # source (the calls had become no-ops); restored - confirm.
        text = text.replace('&lt;', '<').replace('&gt;', '>')
        text = text.replace('<<', u'«').replace('>>', u'»')

        # Drop HTML comments
        text = self.__comment_pattern.sub('', text)

        # Drop garbage tags
        for pattern in self.__garbage_tag_patterns:
            text = pattern.sub('', text)

        # Drop wrapper tags
        for left_pattern, right_pattern in self.__wrapper_tag_patterns:
            text = left_pattern.sub('', text)
            text = right_pattern.sub('', text)

        # Drop single tags
        for good_pattern, bad_pattern in self.__single_tag_patterns:
            text = good_pattern.sub('', text)
            text = bad_pattern.sub('', text)

        # Replace placeholder tags with a numbered marker
        for pattern, placeholder in self.__placeholder_tag_patterns:
            index = 1
            for match in pattern.finditer(text):
                text = text.replace(match.group(),
                                    '[%s %d]' % (placeholder, index))
                index += 1

        # Drop tables and templates.  Both are first rewritten to single
        # braces; the pattern only removes innermost "{...}" groups, so it
        # is applied three times to peel off three nesting levels.
        text = text.replace('{{end box}}', '}')
        text = text.replace('{{', '{').replace('}}', '}')
        text = text.replace('{|', '{').replace('|}', '}')
        for _ in range(3):
            text = self.__table_pattern.sub('', text)

        # Well-formed wikilinks; two passes so that a link that enclosed
        # another link is resolved once the inner one is gone.
        good_wikilink_pattern = self.__wikilink_pattern[0]
        for _ in range(2):
            for match in good_wikilink_pattern.finditer(text):
                wikilink = match.group()
                text = text.replace(wikilink,
                                    self.__handle_wikilink(wikilink[2:-2]))

        # Malformed wikilinks
        bad_left_wikilink_pattern = self.__wikilink_pattern[1]
        for match in bad_left_wikilink_pattern.finditer(text):
            wikilink = match.group()
            text = text.replace(wikilink,
                                self.__handle_wikilink(wikilink[1:-2]))
        bad_right_wikilink_pattern = self.__wikilink_pattern[2]
        for match in bad_right_wikilink_pattern.finditer(text):
            wikilink = match.group()
            text = text.replace(wikilink,
                                self.__handle_wikilink(wikilink[2:-1]))
        text = text.replace('[[', '').replace(']]', '')

        # Drop HTTP links
        text = self.__http_link_pattern.sub('', text).replace('[]', '')

        # Bold and italic
        apostrophe_bold_pattern = self.__apostrophe_pattern[0]
        for match in apostrophe_bold_pattern.finditer(text):
            bold_text = match.group(1)
            text = text.replace(bold_text, bold_text[3:-3])
        apostrophe_italic_pattern = self.__apostrophe_pattern[1]
        for match in apostrophe_italic_pattern.finditer(text):
            italic_text = match.group(1)
            text = text.replace(italic_text, '"%s"' % italic_text[2:-2])
        text = text.replace("'''", '').replace("''", '"')

        # Named character entities ('&amp;' first, so that an escaped
        # entity such as '&amp;nbsp;' is decoded by the table afterwards).
        # NOTE(review): '&amp;' was entity-decoded in the received source.
        text = text.replace('&amp;', '&').replace('""', '"')
        text = self.__char_entity_pattern.sub(self.__replace_char_entity,
                                              text)

        # Numeric character references
        for match in self.__numeric_entity_pattern.finditer(text):
            entity = match.group()
            text = text.replace(entity, self.__handle_unicode(entity))

        # Tidy up whitespace and punctuation
        text = text.replace('\t', ' ')
        text = self.__multi_space_pattern.sub(' ', text)
        text = self.__multi_dot_pattern.sub('...', text)
        text = text.replace(' ,', ',').replace(' .', '.')
        text = text.replace(' :', ':').replace(' ;', ';')
        text = text.replace(',,', ',').replace(',.', '.')
        text = text.replace('( ', '(').replace(' )', ')')
        text = text.replace('[ ', '[').replace(' ]', ']')
        text = text.replace(u'« ', u'«').replace(u' »', u'»')

        wiki_document.text = text
        return wiki_document

    def __compact(self, wiki_document):
        """Keep the title, section titles and meaningful lines only.

        Returns None when only the page title survives.
        """
        page = list()
        paragraph = list()

        for line in wiki_document.text.split('\n'):
            line = line.strip()
            if not line:
                continue

            # Page title ("++Title++")
            if line.startswith('++'):
                title = line[2:-2]
                if title and title[-1] not in '!?':
                    title = '%s.' % title
                page = [title]
            # Section title ("==Title=="); the previous section is kept
            # only if it collected some text besides its own title.
            elif line.startswith('=='):
                if len(paragraph) > 1:
                    page.extend(paragraph)
                title = line[2:-2]
                if title and title[-1] not in '!?':
                    title = '%s.' % title
                paragraph = [title]
            # Bulleted / numbered / definition lists
            elif line[0] in '*#:;':
                continue
            # Table leftovers
            elif line[0] in '{|' or line[-1] in '}':
                continue
            # Meaningless lines: fully parenthesised, or only dots/dashes
            elif ((line[0] == '(' and line[-1] == ')')
                  or line.strip('.-') == ''):
                continue
            # Lines with too few tokens
            elif len(line.split()) < 6:
                continue
            # Text before the first section
            elif len(paragraph) == 0:
                page.append(line)
            # Text inside a section
            else:
                paragraph.append(line)

        if len(paragraph) > 1:
            page.extend(paragraph)
        elif len(page) == 1:
            return None

        wiki_document.text = '\n'.join(page)
        return wiki_document

    def __handle_wikilink(self, wikilink):
        """Return the text that should replace a wikilink body.

        wikilink is the link content without the surrounding brackets.
        """
        tokens = wikilink.split(':')
        # Skip empty leading components (e.g. "[[:Category:X]]")
        while not tokens[0]:
            if len(tokens) < 2:
                return ''
            tokens = tokens[1:]

        # Plain link or link into a sister project: keep the label
        if (len(tokens) == 1
                or tokens[0].strip().lower()
                in self.__class__.__project_namespaces):
            tokens = tokens[-1].split('|')
            while not tokens[-1]:
                if len(tokens) < 2:
                    return ''
                tokens = tokens[:-1]
            return tokens[-1].split('#')[-1].split('/')[-1].strip()

        # Images and categories are dropped entirely
        if tokens[0].strip().lower() in self.__class__.__garbage_namespaces:
            return ''

        # Any other namespace: keep the label only if one was given
        tokens = tokens[-1].split('|')
        while not tokens[-1]:
            if len(tokens) < 2:
                return ''
            tokens = tokens[:-1]
        if len(tokens) == 1:
            return ''
        return tokens[-1].split('#')[-1].split('/')[-1].strip()

    def __replace_char_entity(self, match):
        """re.sub callback: decode a known '&name;', keep unknown ones."""
        entity = match.group()
        return self.__class__.__char_entities.get(entity, entity)

    def __handle_unicode(self, entity):
        """Decode a '&#NNN;' reference; '' for code points that are dropped.

        Code points outside the Basic Multilingual Plane are dropped, as
        are surrogates (0xD800-0xDFFF): a lone surrogate cannot be encoded
        as UTF-8 when the article is written out.
        """
        numeric_code = int(entity[2:-1])
        if numeric_code >= 0x10000:
            return ''
        if 0xD800 <= numeric_code <= 0xDFFF:
            return ''
        return unichr(numeric_code)


### CORE ######################################################################

def process_data(input_file, wiki_extractor, output_dir_name):
    """Split input_file into pages and hand each one to process_page().

    input_file is any iterable of lines; byte strings are decoded as UTF-8.
    NOTE(review): the page delimiters were stripped from the received source
    (both comparisons were against ''); '<page>' / '</page>' restored.
    """
    page = []
    for line in input_file:
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        line = line.strip()
        if line == '<page>':
            page = []
        elif line == '</page>':
            process_page(page, wiki_extractor, output_dir_name)
        else:
            page.append(line)

#------------------------------------------------------------------------------

# Unicode --> AURORA
__trans = {u'а':'a', u'б':'b', u'в':'v', u'г':'g', u'д':'d', u'ђ':'dx',
           u'е':'e', u'ж':'zx', u'з':'z', u'и':'i', u'ј':'j', u'к':'k',
           u'л':'l', u'љ':'lx', u'м':'m', u'н':'n', u'њ':'nx', u'о':'o',
           u'п':'p', u'р':'r', u'с':'s', u'т':'t', u'ћ':'cx', u'ч':'cy',
           u'ф':'f', u'х':'h', u'ц':'c', u'у':'u', u'џ':'dy', u'ш':'sx',
           u'А':'A', u'Б':'B', u'В':'V', u'Г':'G', u'Д':'D', u'Ђ':'Dx',
           u'Е':'E', u'Ж':'Zx', u'З':'Z', u'И':'I', u'Ј':'J', u'К':'K',
           u'Л':'L', u'Љ':'Lx', u'М':'M', u'Н':'N', u'Њ':'Nx', u'О':'O',
           u'П':'P', u'Р':'R', u'С':'S', u'Т':'T', u'Ћ':'Cx', u'Ч':'Cy',
           u'Ф':'F', u'Х':'H', u'Ц':'C', u'У':'U', u'Џ':'Dy', u'Ш':'Sx'}


def translate_article_name(name):
    """Return name made file-name safe.

    Characters found in the transliteration table are replaced by their
    mapping, other alphanumeric characters are kept, everything else
    (spaces included) becomes '_'.
    """
    pieces = []
    for char in name.replace(' ', '_'):
        if char in __trans:
            pieces.append(__trans[char])
        elif char.isalnum():
            pieces.append(char)
        else:
            pieces.append('_')
    return ''.join(pieces)


# Number of articles written so far under a generated "<n>_Unknown" name.
num_of_unknown = 0


def process_page(page, wiki_extractor, output_dir_name):
    """Extract one page and write it to '<output_dir_name>/<title>.txt'.

    page is the list of lines found between the page delimiters.  Pages
    that extract_document() or the extractor reject are skipped silently.
    """
    # Without this declaration the "+=" below makes the name local and the
    # fallback branch fails with UnboundLocalError.
    global num_of_unknown

    wiki_document = extract_document(page)
    if not wiki_document:
        return

    wiki_document = wiki_extractor.extract(wiki_document)
    if not wiki_document:
        return

    title = translate_article_name(wiki_document.title)
    if not title:
        title = "%d_Unknown" % num_of_unknown
        num_of_unknown += 1

    content = wiki_document.__str__()
    if isinstance(content, bytes):
        content = content.decode('utf-8')
    path = os.path.join(output_dir_name, "%s.txt" % title)
    # "with" guarantees the handle is closed even if the write fails.
    with io.open(path, 'w', encoding='utf-8') as output_file:
        output_file.write(content)

#------------------------------------------------------------------------------

def extract_document(page):
    """Build a WikiDocument from the lines of one page.

    Returns None when the page has no title, when the title contains ':'
    or when the whole text sits on a single line.

    NOTE(review): every XML tag literal in this function was stripped from
    the received source.  They were restored from the slice offsets that
    survived (4/-5 -> '<id>'/'</id>', 7/-8 -> '<title>'/'</title>',
    27 -> '<text xml:space="preserve">', -7 -> '</text>') and from the
    comments naming each node - confirm against a real dump.
    """
    wiki_document = WikiDocument()
    for line in page:
        if not line:
            continue

        # Page identifier (XML node); only the first one is taken
        if (not wiki_document.id and line.startswith('<id>')
                and line.endswith('</id>')):
            wiki_document.id = int(line[4:-5])
            continue
        # Page title (XML node); only the first one is taken.  The guard
        # used to test wiki_document.url, which is never assigned.
        elif (wiki_document.title is None and line.startswith('<title>')
              and line.endswith('</title>')):
            title = line[7:-8].replace('&amp;', '&')
            if ':' in title:
                return None
            wiki_document.text = '++%s++' % title
            wiki_document.title = title
            continue
        # Start of the page text (XML node)
        elif line.startswith('<text'):
            if line.endswith('</text>'):
                return None
            line = line[27:]  # len('<text xml:space="preserve">') == 27
            if not line:
                continue
        # End of the page text (XML node)
        elif line.endswith('</text>'):
            line = line[:-7]
            if not line:
                continue
        # Any other XML node
        elif line[0] == '<':
            continue
        # Section title (page text)
        elif line[0] == '=':
            line = '==%s==' % line.strip('= ')

        # No title seen yet: there is no text to append to.
        if wiki_document.text is None:
            continue
        wiki_document.text += '\n%s' % line

    if wiki_document.text is None:
        return None
    return wiki_document


### USER INTERFACE ############################################################

def show_help():
    """Write the module docstring to standard output."""
    sys.stdout.write(__doc__)


def show_usage(output_file, script_name):
    """Write the one-line usage message to output_file."""
    output_file.write('Usage: %s [options]\n' % script_name)


def show_suggestion(output_file, script_name):
    """Write the "try --help" hint to output_file."""
    output_file.write('Try \'%s --help\' for more information.\n'
                      % script_name)


def show_file_error(script_name, file_name):
    """Report a missing file or directory on standard error."""
    sys.stderr.write('%s: %s: No such file or directory\n'
                     % (script_name, file_name))


def main():
    """Parse the command line and process the dump read from stdin."""
    script_name = os.path.basename(sys.argv[0])

    try:
        # 'compress' / 'bytes=' (and -c / -b) are accepted by the parser
        # but no branch below acts on them.
        long_opts = ['help', 'usage', 'compress', 'bytes=', 'output=']
        opts, args = getopt.gnu_getopt(sys.argv[1:], 'cb:o:', long_opts)
    except getopt.GetoptError:
        show_usage(sys.stderr, script_name)
        show_suggestion(sys.stderr, script_name)
        sys.exit(1)

    output_dir_name = '.'

    for opt, arg in opts:
        if opt == '--help':
            show_help()
            sys.exit()
        elif opt == '--usage':
            show_usage(sys.stdout, script_name)
            sys.exit()
        elif opt in ('-o', '--output'):
            if os.path.isdir(arg):
                output_dir_name = arg
            else:
                show_file_error(script_name, arg)
                sys.exit(3)

    if len(args) > 0:
        show_usage(sys.stderr, script_name)
        show_suggestion(sys.stderr, script_name)
        sys.exit(4)

    wiki_extractor = WikiExtractor()
    # Prefer the raw byte stream where there is one, so that the input is
    # always decoded as UTF-8 by process_data().
    input_file = getattr(sys.stdin, 'buffer', sys.stdin)
    process_data(input_file, wiki_extractor, output_dir_name)


if __name__ == '__main__':
    main()