# -*- coding:utf-8 -*-
#
# Copyright (C) 2015 Wildfire Games
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
# following conditions are met:
#
# Redistributions of source code must retain the above copyright notice, this list of conditions and the following
# disclaimer.
# Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided with the distribution.
# The name of the author may not be used to endorse or promote products derived from this software without specific
# prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from __future__ import absolute_import, division, print_function, unicode_literals

import codecs
import re
import os
import sys
import json as jsonParser

from tokenize import generate_tokens, COMMENT, NAME, OP, STRING
from textwrap import dedent

try:
    unicode
except NameError:
    # Python 3 has no separate ``unicode`` type: ``str`` already is the text type.
    unicode = str


def pathmatch(mask, path):
    """
    Matches paths to a mask, where the mask supports * and **. Paths use / as the separator.

    * matches a non-empty sequence of characters without /.
    ** matches one or more non-empty sequences of characters without /, separated by /
    (that is, it may span several path components).

    The whole path has to match the mask, not just a prefix of it.

    :return: True iff path matches the mask, False otherwise
    """
    # The capturing group makes re.split keep the wildcards: literal text ends up at the
    # even indexes of the result and the wildcards ("*" or "**") at the odd ones.
    pieces = re.split(r"([*][*]?)", mask)
    pattern = ""
    for index, piece in enumerate(pieces):
        if index % 2 != 0:
            pattern += "[^/]+"
            if len(piece) == 2:
                pattern += "(/[^/]+)*"
        else:
            pattern += re.escape(piece)
    pattern += "$"
    return re.match(pattern, path) is not None


class Extractor(object):
    """
    Base class of the extractors: walks a directory and extracts messages from the files
    selected by the file masks. Subclasses implement ``extractFromFile``.
    """

    def __init__(self, directoryPath, filemasks, options):
        """
        :param directoryPath: Directory that ``run`` walks.
        :param filemasks: Either a list of include masks, or a dictionary with the keys
                          ``includeMasks`` and ``excludeMasks``. See ``pathmatch`` for the mask syntax.
        :param options: Extractor specific options, stored as ``self.options``.
        """
        self.directoryPath = directoryPath
        self.options = options

        if isinstance(filemasks, dict):
            self.includeMasks = filemasks["includeMasks"]
            self.excludeMasks = filemasks["excludeMasks"]
        else:
            self.includeMasks = filemasks
            self.excludeMasks = []

    def run(self):
        """
        Extracts messages.

        Folders whose name starts with ``.`` or ``_`` are not visited. A file is processed once
        for every include mask it matches, unless it matches an exclude mask.

        :return: An iterator over ``(message, plural, context, (location, pos), comment)`` tuples.
        :rtype: ``iterator``
        """
        directoryAbsolutePath = os.path.abspath(self.directoryPath)
        for root, folders, filenames in os.walk(directoryAbsolutePath):
            # Prune and sort in place so that os.walk honours the change. Removing entries
            # from the list while iterating over it would skip the entry following each
            # removed one.
            folders[:] = sorted(folder for folder in folders if not folder.startswith(('.', '_')))
            filenames.sort()
            for filename in filenames:
                filename = os.path.relpath(os.path.join(root, filename), self.directoryPath).replace(os.sep, '/')
                for filemask in self.excludeMasks:
                    if pathmatch(filemask, filename):
                        break
                else:
                    # Only reached when no exclude mask matched.
                    for filemask in self.includeMasks:
                        if pathmatch(filemask, filename):
                            filepath = os.path.join(directoryAbsolutePath, filename)
                            for message, plural, context, breadcrumb, position, comments in self.extractFromFile(filepath):
                                # Replace spaces in filenames by non-breaking spaces so that word
                                # wrapping in po files does not split up our paths
                                location = filename.replace(' ', u"\xa0") + (":" + breadcrumb if breadcrumb else "")
                                yield message, plural, context, (location, position), comments

    def extractFromFile(self, filepath):
        """
        Extracts messages from a specific file. To be implemented by the subclasses.

        :return: An iterator over ``(message, plural, context, breadcrumb, position, comments)`` tuples.
        :rtype: ``iterator``
        """
        pass


class javascript(Extractor):
    """
    Extract messages from JavaScript source code.
    """

    empty_msgid_warning = (
        '%s: warning: Empty msgid.  It is reserved by GNU gettext: gettext("") '
        'returns the header entry with meta information, not the empty string.'
    )

    def extractJavascriptFromFile(self, fileObject):
        """
        Tokenizes the content of fileObject and collects the calls to the functions named in
        the ``keywords`` option, along with the comments starting with one of the ``commentTags``.

        :return: An iterator over ``(lineno, funcname, messages, comments)`` tuples, where
                 ``messages`` is a single argument or a tuple of arguments. Arguments that are
                 not string literals are reported as ``None``.
        :rtype: ``iterator``
        """
        from extractors.jslexer import tokenize, unquote_string
        funcname = message_lineno = None
        messages = []
        last_argument = None
        translator_comments = []
        concatenate_next = False
        last_token = None
        # -1 means that we are not inside the argument list of a keyword call
        call_stack = -1
        comment_tags = self.options.get('commentTags', [])
        keywords = self.options.get('keywords', {}).keys()

        for token in tokenize(fileObject.read()):
            if token.type == 'operator' and token.value == '(':
                if funcname:
                    message_lineno = token.lineno
                    call_stack += 1

            elif call_stack == -1 and token.type == 'linecomment':
                value = token.value[2:].strip()
                # A line comment directly below a translator comment continues it
                if translator_comments and \
                   translator_comments[-1][0] == token.lineno - 1:
                    translator_comments.append((token.lineno, value))
                    continue

                for comment_tag in comment_tags:
                    if value.startswith(comment_tag):
                        translator_comments.append((token.lineno, value.strip()))
                        break

            elif token.type == 'multilinecomment':
                # only one multi-line comment may precede a translation
                translator_comments = []
                value = token.value[2:-2].strip()
                for comment_tag in comment_tags:
                    if value.startswith(comment_tag):
                        lines = value.splitlines()
                        if lines:
                            lines[0] = lines[0].strip()
                            lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                            for offset, line in enumerate(lines):
                                translator_comments.append((token.lineno + offset, line))
                        break

            elif funcname and call_stack == 0:
                if token.type == 'operator' and token.value == ')':
                    if last_argument is not None:
                        messages.append(last_argument)
                    if len(messages) > 1:
                        messages = tuple(messages)
                    elif messages:
                        messages = messages[0]
                    else:
                        messages = None

                    # Comments don't apply unless they immediately precede the
                    # message
                    if translator_comments and \
                       translator_comments[-1][0] < message_lineno - 1:
                        translator_comments = []

                    if messages is not None:
                        yield (message_lineno, funcname, messages,
                               [comment[1] for comment in translator_comments])

                    # Reset the state for the next call
                    funcname = message_lineno = last_argument = None
                    concatenate_next = False
                    translator_comments = []
                    messages = []
                    call_stack = -1

                elif token.type == 'string':
                    new_value = unquote_string(token.value)
                    if concatenate_next:
                        last_argument = (last_argument or '') + new_value
                        concatenate_next = False
                    else:
                        last_argument = new_value

                elif token.type == 'operator':
                    if token.value == ',':
                        if last_argument is not None:
                            messages.append(last_argument)
                            last_argument = None
                        else:
                            messages.append(None)
                        concatenate_next = False
                    elif token.value == '+':
                        concatenate_next = True

            elif call_stack > 0 and token.type == 'operator' \
                 and token.value == ')':
                call_stack -= 1

            elif funcname and call_stack == -1:
                # The keyword was not followed by an opening parenthesis
                funcname = None

            elif call_stack == -1 and token.type == 'name' and \
                 token.value in keywords and \
                 (last_token is None or last_token.type != 'name' or
                  last_token.value != 'function'):
                funcname = token.value

            last_token = token

    def extractFromFile(self, filepath):
        """
        Extracts messages from a JavaScript file, validating the arguments of each call
        against the specification of its keyword. Invalid calls are skipped, and calls with an
        empty message additionally print a warning to stderr.

        :return: An iterator over ``(message, plural, context, None, lineno, comments)`` tuples.
        :rtype: ``iterator``
        """
        with codecs.open(filepath, 'r', encoding='utf-8-sig') as fileObject:
            for lineno, funcname, messages, comments in self.extractJavascriptFromFile(fileObject):
                if funcname:
                    spec = self.options.get('keywords', {})[funcname] or (1,)
                else:
                    spec = (1,)
                if not isinstance(messages, (list, tuple)):
                    messages = [messages]
                if not messages:
                    continue

                # Validate the messages against the keyword's specification
                context = None
                msgs = []
                invalid = False
                # last_index is 1 based like the keyword spec
                last_index = len(messages)
                for index in spec:
                    if isinstance(index, (list, tuple)):
                        if last_index < index[0]:
                            # Not enough arguments to provide the context
                            invalid = True
                            break
                        context = messages[index[0] - 1]
                        continue
                    if last_index < index:
                        # Not enough arguments
                        invalid = True
                        break
                    message = messages[index - 1]
                    if message is None:
                        invalid = True
                        break
                    msgs.append(message)
                if invalid:
                    continue

                # keyword spec indexes are 1 based, therefore '-1'
                if isinstance(spec[0], (tuple, list)):
                    # context-aware *gettext method
                    first_msg_index = spec[1] - 1
                else:
                    first_msg_index = spec[0] - 1
                if not messages[first_msg_index]:
                    # An empty string msgid isn't valid, emit a warning
                    where = '%s:%i' % (getattr(fileObject, 'name', None) or '(unknown)', lineno)
                    print(self.empty_msgid_warning % where, file=sys.stderr)
                    continue

                messages = tuple(msgs)
                message = messages[0]
                plural = None
                if len(messages) == 2:
                    plural = messages[1]

                yield message, plural, context, None, lineno, comments


class cpp(javascript):
    """
    Extract messages from C++ source code.
    """
    pass


class txt(Extractor):
    """
    Extract messages from plain text files.
    """

    def extractFromFile(self, filepath):
        """
        Yields every non-empty line of the file as a message, with its 1 based line number
        as position.
        """
        with codecs.open(filepath, "r", encoding='utf-8-sig') as fileObject:
            for lineCount, line in enumerate(fileObject.readlines(), 1):
                line = line.strip("\n\r")
                if line:
                    yield line, None, None, None, lineCount, []


class json(Extractor):
    """
    Extract messages from JSON files.
    """

    def __init__(self, directoryPath=None, filemasks=None, options=None):
        """
        All the parameters are optional, so that the extractor can also be used on strings
        only (see ``setOptions`` and ``extractFromString``).
        """
        # None instead of mutable default values, which would be shared by all the instances.
        super(json, self).__init__(
            directoryPath,
            [] if filemasks is None else filemasks,
            {} if options is None else options)
        self.breadcrumbs = []
        self.keywords = self.options.get("keywords", {})
        self.context = self.options.get("context", None)

    def setOptions(self, options):
        """
        Replaces the options, and the keywords and context read from them.
        """
        self.options = options
        self.keywords = self.options.get("keywords", {})
        self.context = self.options.get("context", None)

    @staticmethod
    def formatBreadcrumbs(breadcrumbs):
        """
        Formats a list of dictionary keys and list indexes as a single string, such as
        ``[0].name[2].text``. An empty list results in an empty string.
        """
        pieces = []
        for position, piece in enumerate(breadcrumbs):
            if isinstance(piece, int):
                pieces.append("[" + str(piece) + "]")
            elif position == 0:
                pieces.append(piece)
            else:
                pieces.append("." + piece)
        return "".join(pieces)

    def extractFromFile(self, filepath):
        """
        Extracts messages from a JSON file. The position is always -1, the location of a
        message inside of the file is given by its breadcrumb instead.
        """
        # utf-8-sig as in the other extractors: a byte order mark would make the JSON parser fail.
        with codecs.open(filepath, "r", 'utf-8-sig') as fileObject:
            for message, breadcrumbs in self.extractFromString(fileObject.read()):
                yield message, None, self.context, self.formatBreadcrumbs(breadcrumbs), -1, []

    def extractFromString(self, string):
        """
        Extracts the non-empty messages from a string containing a JSON document.

        :return: An iterator over ``(message, breadcrumbs)`` tuples.
        :raises Exception: If the document is neither a list nor a dictionary.
        """
        self.breadcrumbs = []
        jsonDocument = jsonParser.loads(string)
        if isinstance(jsonDocument, list):
            extracted = self.parseList(jsonDocument)
        elif isinstance(jsonDocument, dict):
            extracted = self.parseDictionary(jsonDocument)
        else:
            raise Exception("Unexpected JSON document parent structure (not a list or a dictionary). You must extend the JSON extractor to support it.")

        for message, breadcrumbs in extracted:
            if message:  # Skip empty strings.
                yield message, breadcrumbs

    def parseList(self, itemsList):
        """
        Searches the lists and dictionaries contained in a list for keywords.
        """
        for index, listItem in enumerate(itemsList):
            self.breadcrumbs.append(index)
            if isinstance(listItem, list):
                for message, breadcrumbs in self.parseList(listItem):
                    yield message, breadcrumbs
            elif isinstance(listItem, dict):
                for message, breadcrumbs in self.parseDictionary(listItem):
                    yield message, breadcrumbs
            del self.breadcrumbs[-1]

    def parseDictionary(self, dictionary):
        """
        Extracts the values of the keys of a dictionary that are keywords, and searches the
        lists and dictionaries stored under the other keys for keywords.
        """
        for keyword in dictionary:
            value = dictionary[keyword]
            self.breadcrumbs.append(keyword)
            if keyword in self.keywords:
                if isinstance(value, unicode):
                    # Yield a copy: self.breadcrumbs keeps changing during the traversal.
                    yield value, list(self.breadcrumbs)
                elif isinstance(value, list):
                    for message, breadcrumbs in self.extractList(value):
                        yield message, breadcrumbs
                elif isinstance(value, dict):
                    for message, breadcrumbs in self.extractDictionary(value):
                        yield message, breadcrumbs
            elif isinstance(value, list):
                for message, breadcrumbs in self.parseList(value):
                    yield message, breadcrumbs
            elif isinstance(value, dict):
                for message, breadcrumbs in self.parseDictionary(value):
                    yield message, breadcrumbs
            del self.breadcrumbs[-1]

    def extractList(self, itemsList):
        """
        Extracts the strings that are direct items of a list.
        """
        for index, listItem in enumerate(itemsList):
            self.breadcrumbs.append(index)
            if isinstance(listItem, unicode):
                yield listItem, list(self.breadcrumbs)
            del self.breadcrumbs[-1]

    def extractDictionary(self, dictionary):
        """
        Extracts the strings that are direct values of a dictionary.
        """
        for keyword in dictionary:
            self.breadcrumbs.append(keyword)
            if isinstance(dictionary[keyword], unicode):
                yield dictionary[keyword], list(self.breadcrumbs)
            del self.breadcrumbs[-1]


class xml(Extractor):
    """
    Extract messages from XML files.
    """

    def __init__(self, directoryPath, filemasks, options):
        super(xml, self).__init__(directoryPath, filemasks, options)
        self.keywords = self.options.get("keywords", {})
        self.jsonExtractor = None

    def getJsonExtractor(self):
        """
        Returns the JSON extractor used for the ``extractJson`` keywords, creating it on first use.
        """
        if not self.jsonExtractor:
            self.jsonExtractor = json()
        return self.jsonExtractor

    def extractFromFile(self, filepath):
        """
        Extracts messages from the text of the elements whose tag is a keyword. The position
        of a message is the source line of its element.

        The options of a keyword that are taken into account are ``extractJson``,
        ``locationAttributes``, ``tagAsContext`` and ``splitOnWhitespace``, as well as the
        ``context`` and ``comment`` attributes of the elements.
        """
        from lxml import etree
        with codecs.open(filepath, "r", encoding='utf-8-sig') as fileObject:
            xmlDocument = etree.parse(fileObject)

        for keyword in self.keywords:
            keywordOptions = self.keywords[keyword]
            for element in xmlDocument.iter(keyword):
                position = element.sourceline
                if element.text is None:
                    continue

                context = None
                comments = []
                if "extractJson" in keywordOptions:
                    # The text of the element is a JSON document
                    jsonExtractor = self.getJsonExtractor()
                    jsonExtractor.setOptions(keywordOptions["extractJson"])
                    for message, breadcrumbs in jsonExtractor.extractFromString(element.text):
                        yield message, None, context, json.formatBreadcrumbs(breadcrumbs), position, comments
                    continue

                breadcrumb = None
                if "locationAttributes" in keywordOptions:
                    attributes = [element.get(attribute) for attribute in keywordOptions["locationAttributes"] if attribute in element.attrib]
                    breadcrumb = "({attributes})".format(attributes=", ".join(attributes))
                if "tagAsContext" in keywordOptions:
                    context = keyword
                if "context" in element.attrib:
                    # An explicit context takes precedence over the tag
                    context = unicode(element.get("context"))
                if "comment" in element.attrib:
                    comment = element.get("comment")
                    comment = u" ".join(comment.split())  # Remove tabs, line breaks and unnecessary spaces.
                    comments.append(comment)
                if "splitOnWhitespace" in keywordOptions:
                    for splitText in element.text.split():
                        # split on whitespace is used for token lists, there, a leading '-' means the token has to be removed, so it's not to be processed here either
                        if splitText[0] != "-":
                            yield unicode(splitText), None, context, breadcrumb, position, comments
                else:
                    yield unicode(element.text), None, context, breadcrumb, position, comments


# Hack from http://stackoverflow.com/a/2819788
class FakeSectionHeader(object):
    """
    Wraps a file object and serves a ``[root]`` section header line before the lines of
    the file itself.
    """

    def __init__(self, fp):
        self.fp = fp
        self.sechead = '[root]\n'

    def readline(self):
        """
        Returns the section header on the first call, and the next line of the file afterwards.
        """
        if self.sechead:
            header, self.sechead = self.sechead, None
            return header
        return self.fp.readline()

    def __iter__(self):
        # The read_file method of the Python 3 config parsers iterates over its argument
        # instead of calling readline. readline returns an empty string at the end of the file.
        return iter(self.readline, '')


class ini(Extractor):
    """
    Extract messages from INI files.
    """

    def __init__(self, directoryPath, filemasks, options):
        super(ini, self).__init__(directoryPath, filemasks, options)
        self.keywords = self.options.get("keywords", [])

    def extractFromFile(self, filepath):
        """
        Extracts the values of the keywords from an INI file, stripping the quotation marks
        around them. The keywords are read from the part of the file placed under the
        ``[root]`` header added by ``FakeSectionHeader``. The position of a message is its
        keyword in parentheses.
        """
        try:
            from ConfigParser import RawConfigParser  # Python 2
        except ImportError:
            from configparser import RawConfigParser  # Python 3
        config = RawConfigParser()
        with open(filepath) as fileObject:
            wrappedFile = FakeSectionHeader(fileObject)
            # readfp is the only option in Python 2, and is deprecated in Python 3.
            if hasattr(config, "read_file"):
                config.read_file(wrappedFile)
            else:
                config.readfp(wrappedFile)

        for keyword in self.keywords:
            message = config.get("root", keyword).strip('"').strip("'")
            context = None
            position = " ({})".format(keyword)
            comments = []
            yield message, None, context, None, position, comments