# Source code for dfttools.parsers.generic

"""
Contains helper routines to parse text.
"""
import re
import sys
import json
    
import numpy

# Regular-expression sources for the basic tokens recognised by the parsers.
#
# re_float: an optionally signed decimal number with an optional exponent
# introduced by "e"/"E" or the Fortran-style "d"/"D", or the literal "nan".
# The mantissa may start with the decimal point (".5", "-.25") because
# Fortran programs commonly print numbers below one without the leading
# zero. The exponent requires at least one digit, so that a unit glued to a
# number ("12eV") does not leak a dangling "e" into the match. The pattern
# keeps exactly two capturing groups: the number and "nan".
re_float = r"([-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eEdD][-+]?[0-9]+)?)|(nan)"
# re_int: an optionally signed run of digits.
re_int = r"([-+]?\d+)"
# re_line: a whole line; relies on re.MULTILINE (see cre_line below).
re_line = r"^(.*)$"
# re_word: a run of word characters.
re_word = r"(\w+)"
# re_varName: a word character followed by at least one more word character,
# digit, parenthesis, dot or underscore.
re_varName = r"(\w[\d\w\(\)\._]+)"
# re_nonspace: a run of non-whitespace characters.
re_nonspace = r"([^\s]+)"
# re_quotedText: shortest text enclosed in single or in double quotes.
re_quotedText = r"(\'.*?\')|(\".*?\")"

# Pre-compiled versions of the patterns above.
cre_float = re.compile(re_float)
cre_int = re.compile(re_int)
cre_line = re.compile(re_line, re.MULTILINE)
cre_word = re.compile(re_word)
cre_varName = re.compile(re_varName)
cre_nonspace = re.compile(re_nonspace)
# DOTALL lets a quoted text span several lines.
cre_quotedText = re.compile(re_quotedText, re.DOTALL)

class ParseError(Exception):
    """
    An exception class dedicated to parsing errors. It adds nothing to
    the built-in ``Exception`` it derives from.
    """
    
class AbstractParser(object):
    """
    A root class for text parsers.

    The text is stored in ``self.data`` and a parser object created from
    it by the module-level ``parse`` is stored in ``self.parser``. The
    source file object, if any, is kept in ``self.file``.

    Args:

        file (str, file): text to parse or a file-like object (anything
        with a ``read`` method) to read the text from.
    """

    def __init__(self, file):
        # Anything exposing ``read`` is treated as a file and consumed
        # completely; everything else is taken to be the text itself.
        is_file = hasattr(file, "read")
        self.file = file if is_file else None
        self.data = file.read() if is_file else file
        self.parser = parse(self.data)

    @staticmethod
    def valid_header(header):
        """
        Checks whether the file header is an expected one. Used in
        automatic determination of file format.

        Args:

            header (str): the file header;

        Returns:

            True if the header is as expected.

        Raises:

            NotImplementedError: always; this base implementation is a
            placeholder.
        """
        raise NotImplementedError

    @staticmethod
    def valid_filename(name):
        """
        Checks whether the file name is an expected one. Used in
        automatic determination of file format.

        Args:

            name (str): the file name;

        Returns:

            True if the name is as expected.

        Raises:

            NotImplementedError: always; this base implementation is a
            placeholder.
        """
        raise NotImplementedError
        
class AbstractJSONParser(AbstractParser):
    """
    A root class for JSON parsers.

    The decoded document is stored in ``self.json``.

    Args:

        data (str, file): text representation of JSON to parse or a
        file-like object to read it from.
    """

    def __init__(self, data):
        # Let the base class read file-like input and set ``self.file``,
        # ``self.data`` and ``self.parser``; then decode the text.
        super(AbstractJSONParser, self).__init__(data)
        self.json = json.loads(self.data)

    def __set_units__(self, field, units):
        """
        Replaces a top-level entry of the decoded JSON by a numpy array
        multiplied by units.

        Args:

            field: key of the entry in ``self.json``;

            units: the factor the array is multiplied by.
        """
        self.json[field] = numpy.array(self.json[field])*units
        
class StringParser(object):
    """
    Simple parser for a string with position memory.

    This class can be used to parse words, numbers, floats and arrays
    from a given string. Based on
    `re <https://docs.python.org/3/library/re.html>`_, it provides the
    basic functionality for the rest of parsing libraries.

    The parser keeps a caret (the current position in the string): all
    searches are performed to the right of the caret and most of the
    methods move it. A stack of saved caret positions is available
    through ``save`` and ``pop``.

    Args:

        string (str): input string to be parsed.

    .. note::

        The input string can be further accessed by ``self.string``
        field. The contents of the string is not copied.

    """

    def __init__(self, string):
        self.string = string
        self.__position__ = 0
        self.__history__ = []

    def _finditer(self, expression):
        """
        Iterates over matches of expression to the right of the caret.

        Args:

            expression (str,re.RegexObject): expression to match.
            If *expression* is str it is matched literally and the case
            is ignored.

        Returns:

            An iterator over match objects. Match positions are counted
            from the caret, not from the beginning of the string.
        """
        if isinstance(expression, str):
            expression = re.compile(re.escape(expression), re.I)
        # The tail is sliced off (rather than searched with a start
        # offset) so that anchors such as "^" match at the caret.
        return expression.finditer(self.string[self.__position__:])

    @staticmethod
    def _to_float(text):
        """
        Converts a matched float, possibly with a Fortran-style "d" or
        "D" exponent, into a python float.
        """
        return float(text.replace('d', 'e').replace('D', 'E'))

    def goto(self, expression, n = 1):
        """
        Goes to the beginning of nth occurrence of expression in the
        string.

        Args:

            expression (str,re.RegexObject): expression to match.
            If *expression* is str then the case is ignored.

        Kwargs:

            n (int): number of occurrences to match. The caret is not
            moved if *n* is less than one.

        Raises:

            StopIteration: No occurrences left in the string.
        """
        matches = self._finditer(expression)
        start = 0
        for _ in range(n):
            start = next(matches).start()
        self.__position__ += start

    def pop(self):
        """
        Returns to the previously saved position of the parser.

        Raises:

            IndexError: No saved positions left.
        """
        self.__position__ = self.__history__.pop()

    def save(self):
        """
        Saves the current position of the parser.

        Example::

            sp = StringParser("A very important integer 123 describes something.")

            sp.skip("very") # The caret is set to the right of "very"
            sp.save() # The caret position is saved

            sp.skip("describes") # The caret is set to the right of "describes"
            # Now the call to StringParser.nextInt() will yield StopIteration.
            # To return the caret to the previously saved position
            # StringParser.pop() is used.
            sp.pop()

            # Now it is possible to read the integer
            sp.nextInt()
        """
        self.__history__.append(self.__position__)

    def skip(self, expression, n = 1):
        """
        Skips n occurrences of expression in the string.

        Args:

            expression (str,re.RegexObject): expression to match.
            If *expression* is str then the case is ignored.

        Kwargs:

            n (int): number of occurrences to skip. The caret is not
            moved if *n* is less than one.

        Raises:

            StopIteration: No occurrences left in the string.
        """
        matches = self._finditer(expression)
        end = 0
        for _ in range(n):
            end = next(matches).end()
        self.__position__ += end

    def skipAll(self, expression):
        """
        Goes to the end of the last occurrence of a given expression in
        the string.

        Args:

            expression (str,re.RegexObject): expression to match.
            If *expression* is str then the case is ignored.

        Raises:

            StopIteration: No occurrences left in the string.
        """
        matches = self._finditer(expression)
        # At least one occurrence is required.
        end = next(matches).end()
        for found in matches:
            end = found.end()
        self.__position__ += end

    def present(self, expression):
        """
        Test the string for the presence of expression.

        Args:

            expression (str,re.RegexObject): expression to match.
            If *expression* is str then the case is ignored.

        Returns:

            True if *expression* is matched to the right of current
            position of the caret.
        """
        try:
            self.distance(expression)
        except StopIteration:
            return False
        return True

    def distance(self, expression, n = 1, default = None):
        """
        Calculates distance to nth occurrence of expression in characters.
        The caret is not moved.

        Args:

            expression (str,re.RegexObject): expression to match. If
            *expression* is str then the case is ignored.

        Kwargs:

            n (int): consecutive number of expression to calculate
            distance to;

            default: return value if StopIteration occurs. Ignored if
            None.

        Returns:

            Numbers of characters between caret position and *nth*
            occurrence of *expression* or *default* if too few
            occurrences found. Zero if *n* is less than one.

        Raises:

            StopIteration: No occurrences left in the string.
        """
        matches = self._finditer(expression)
        start = 0
        try:
            for _ in range(n):
                start = next(matches).start()
        except StopIteration:
            if default is None:
                raise
            return default
        return start

    def reset(self):
        """
        Resets the caret to the beginning of the string.
        """
        self.__position__ = 0

    def nextMatch(self, match, n = None):
        """
        Basic function for matching data.

        Args:

            match (re.RegexObject): object to match;

        Kwargs:

            n (array,int,str,re.RegexObject): specifies either shape of
            the numpy array returned or the regular expression to stop
            matching before;

        Returns:

            If *n* is specified returns a numpy array of a given shape
            filled with matches from string. Otherwise returns a single
            match. If *n* is an expression, only the matches which end
            not later than the first occurrence of *n* starts are
            returned. The caret is put behind the last match.

        Raises:

            StopIteration: Not enough matches left in the string or the
            expression *n* is not found.

        """
        found = match.finditer(self.string[self.__position__:])

        # A single match.
        if n is None:
            item = next(found)
            self.__position__ += item.end()
            return item.group()

        # A fixed number of matches given by a count or by a shape.
        if isinstance(n, (int, numpy.integer, list, tuple, numpy.ndarray)):
            count = int(numpy.prod(n))
            result = numpy.zeros(count, dtype = object)
            item = None
            for index in range(count):
                item = next(found)
                result[index] = item.group()
            # The caret is moved only when all matches have been found.
            if item is not None:
                self.__position__ += item.end()
            return result.reshape(n)

        # All matches located before the next occurrence of n.
        limit = self.distance(n)
        collected = []
        end = 0
        for item in found:
            if item.end() > limit:
                break
            collected.append(item.group())
            end = item.end()
        self.__position__ += end
        return numpy.array(collected, dtype = object)

    def matchAfter(self, after, match, n = None):
        """
        Matches pattern after another pattern and returns caret to initial
        position. Particularly useful for getting value for parameter
        name. Supports matching arrays via keyword parameter *n*.

        Args:

            after (re.RegexObject): pattern to skip;

            match (re.RegexObject): pattern to match;

        Kwargs:

            n (array,int,str,re.RegexObject): specifies either shape of
            the numpy array returned or the regular expression to stop
            matching before;

        Returns:

            If *n* is specified returns a numpy array of a given shape
            filled with matches from string. Otherwise returns a single
            match.

        Raises:

            StopIteration: Not enough matches left in the string.

        The function is equal to

            >>> sp = StringParser("Some string")
            >>> sp.save()
            >>> sp.skip(after)
            >>> result = sp.nextMatch(match, n = n)
            >>> sp.pop()

        except that the caret is restored even if the matching fails.

        """
        self.save()
        try:
            self.skip(after)
            return self.nextMatch(match, n = n)
        finally:
            # Keeps the stack of saved positions balanced on failure.
            self.pop()

    def nextInt(self, n = None):
        """
        Reads integers from string.

        Kwargs:

            n (array,int,str,re.RegexObject): specifies either shape of
            the numpy array returned or the regular expression to stop
            matching before;

        Returns:

            If *n* is specified returns a numpy array of a given shape
            filled with integers from string. Otherwise returns a single
            int. The caret is put behind the last integer read.

        Raises:

            StopIteration: Not enough integers left in the string.

        Example:

            >>> sp = StringParser("1 2 3 4 5 6 7 8 9 abc 10")
            >>> sp.nextInt((2,3))
            array([[1, 2, 3],
                [4, 5, 6]])
            >>> sp.nextInt("abc")
            array([7, 8, 9])

        """
        result = self.nextMatch(cre_int, n = n)
        if n is None:
            return int(result)
        # The builtin is used: the numpy.int alias is gone from recent
        # numpy releases.
        return result.astype(int)

    def intAfter(self, after, n = None):
        """
        Reads integers from string after the next regular expression.
        Returns the caret to initial position. Particularly useful for
        getting value for parameter name.

        Args:

            after (re.RegexObject) - pattern to skip;

        Kwargs:

            n (array,int,str,re.RegexObject): specifies either shape of
            the numpy array returned or the regular expression to stop
            matching before;

        Returns:

            If *n* is specified returns a numpy array of a given shape
            filled with integers from string. Otherwise returns a single
            int.

        Raises:

            StopIteration: Not enough integers left in the string.

        Example:

            >>> sp = StringParser("cows = 3, rabbits = 5")
            >>> sp.intAfter("rabbits")
            5
            >>> sp.intAfter("cows")
            3

        """
        self.save()
        try:
            self.skip(after)
            return self.nextInt(n = n)
        finally:
            # The caret is restored even if nothing was found.
            self.pop()

    def nextFloat(self, n = None):
        """
        Reads floats from string. Fortran-style exponents ("1.5d-3") are
        understood.

        Kwargs:

            n (array,int,str,re.RegexObject): specifies either shape of
            the numpy array returned or the regular expression to stop
            matching before;

        Returns:

            If *n* is specified returns a numpy array of a given shape
            filled with floats from string. Otherwise returns a single
            float. The caret is put behind the last float read.

        Raises:

            StopIteration: Not enough floats left in the string.

        Example:

            >>> sp = StringParser("1.9 2.8 3.7 56.2E-2 abc")
            >>> sp.nextFloat(2)
            array([ 1.9, 2.8])
            >>> sp.nextFloat("abc")
            array([ 3.7  ,  0.562])

        """
        result = self.nextMatch(cre_float, n = n)
        if n is None:
            return self._to_float(result)
        # Elements are converted one by one: "d" exponents are not
        # understood by numpy.
        values = [self._to_float(item) for item in result.flat]
        return numpy.array(values, dtype = float).reshape(result.shape)

    def floatAfter(self, after, n = None):
        """
        Reads floats from string after the next regular expression.
        Returns the caret to initial position. Particularly useful for
        getting value for parameter name.

        Args:

            after (re.RegexObject) - pattern to skip;

        Kwargs:

            n (array,int,str,re.RegexObject): specifies either shape of
            the numpy array returned or the regular expression to stop
            matching before;

        Returns:

            If *n* is specified returns a numpy array of a given shape
            filled with floats from string. Otherwise returns a single
            float.

        Raises:

            StopIteration: Not enough floats left in the string.

        Example:

            >>> sp = StringParser("apples = 3.4; bananas = 7")
            >>> sp.floatAfter("bananas")
            7.0
            >>> sp.floatAfter("apples")
            3.4

        """
        self.save()
        try:
            self.skip(after)
            return self.nextFloat(n = n)
        finally:
            # The caret is restored even if nothing was found.
            self.pop()

    def nextLine(self, n = None):
        """
        Reads lines from string.

        Kwargs:

            n (array,int,str,re.RegexObject): specifies either shape of
            the numpy array returned or the regular expression to stop
            matching before;

        Returns:

            If *n* is specified returns a numpy array of a given shape
            filled with lines from string. Otherwise returns a single
            line. The caret is put behind the last line read.

        Raises:

            StopIteration: Not enough lines left in the string.

        """
        if self.__position__ == len(self.string):
            raise StopIteration
        result = self.nextMatch(cre_line, n = n)
        # Steps over the character following the last line read (the
        # line break) unless the end of the string is reached.
        if self.__position__ < len(self.string):
            self.__position__ += 1
        return result

    def startOfLine(self):
        """
        Goes to the beginning of the current line.
        """
        # The line starts right after the closest line break located to
        # the left of the caret; rfind yields -1, and hence position 0,
        # if there is none.
        self.__position__ = self.string.rfind("\n", 0, self.__position__) + 1

    def closest(self, exprs):
        """
        Returns the closest match of a set of expressions.

        Args:

            exprs (list): a set of expressions being matched. The case
            is ignored for all of them.

        Returns:

            Index of the closest expression. The distance is measured to
            the beginnings of matches; if several expressions match at
            the same position the first of them is reported. Returns
            None if none of expressions matched.

        Example:

            >>> sp = StringParser("This is a large string")
            >>> sp.closest(("a","string","this"))
            2

        """
        remainder = self.string[self.__position__:]
        best_index = None
        best_start = None
        for index, expression in enumerate(exprs):
            if isinstance(expression, str):
                pattern = re.compile(re.escape(expression), re.I)
            else:
                # Own flags of a compiled expression are kept.
                pattern = re.compile(expression.pattern, expression.flags | re.I)
            found = pattern.search(remainder)
            if found is None:
                continue
            # Strict comparison: the first expression wins a tie.
            if best_start is None or found.start() < best_start:
                best_index = index
                best_start = found.start()
        return best_index

# Short alias: ``parse(text)`` creates a StringParser over ``text``.
parse = StringParser