diff --git a/Makefile b/Makefile index 00430ea9..61caea3f 100644 --- a/Makefile +++ b/Makefile @@ -11,8 +11,10 @@ RM ?= rm .PHONY: all build \ check clean \ - develop dist doc doc-data djangotest \ - gstest pytest \ + develop dist doc \ + inputrc-no-unicode \ + inputrc-unicode \ + pytest \ rmChangeLog \ test @@ -41,6 +43,9 @@ install: build test check: pytest +#: Build Sphinx HTML documentation +doc: mathics_scanner/data/characters.json + make -C docs html #: Remove derived files clean: diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..d0c3cbf1 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..6247f7e2 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/api.rst b/docs/source/api.rst new file mode 100644 index 00000000..7368a54b --- /dev/null +++ b/docs/source/api.rst @@ -0,0 +1,76 @@ +=== +API +=== + +.. automodule:: mathics_scanner + :members: is_symbol_name + +Tokenization +============ + +Tokenization is performed by the ``Tokeniser`` class. The ``next`` method +consumes characters from a feeder and returns a token if the tokenization +succeeds. If the tokenization fails an instance of ``TranslateError`` is +raised. + +.. autoclass:: Tokeniser(object) + :members: __init__, incomplete, sntx_message, next + +The tokens returned by ``next`` are instances of the ``Token`` class: + +.. autoclass:: Token(object) + :members: __init__ + :special-members: + +Feeders +======= + +A feeder is an intermediate between the tokeniser and the actual file being scanned. Feeders used by the tokeniser are instances of the ``LineFeeder`` class: + +.. autoclass:: LineFeeder(object) + :members: feed, empty, message, syntax_message + +Specialized Feeders +------------------- + +To read multiple lines of code at a time use the ``MultiLineFeeder`` class: + +.. autoclass:: MultiLineFeeder(LineFeeder) + :members: __init__ + +To read a single line of code at a time use the ``SingleLineFeeder`` class: + +.. autoclass:: SingleLineFeeder(LineFeeder) + :members: __init__ + +To read lines of code from a file use the ``FileLineFeeder`` class: + +.. autoclass:: FileLineFeeder(LineFeeder) + :members: __init__ + +Character Conversions +===================== + +.. 
automodule:: mathics_scanner.characters + :members: replace_wl_with_plain_text, replace_unicode_with_wl + +The ``mathics_scanner.characters`` module also exposes special dictionaries: + +``named_characters`` + Maps fully qualified names of named characters to their corresponding + code-points in Wolfram's internal representation: + +.. code-block:: python + + for named_char, code in named_characters.items(): + print(f"The named character {named_char} maps to U+{ord(code):X}") + +``aliased_characters`` + Maps the ESC sequence alias of all aliased characters to their corresponding + code points in Wolfram's internal representation. + +mathics_scanner.generate.rl_inputrc +----------------------------------- + +.. automodule:: mathics_scanner.generate.rl_inputrc + :members: generate_inputrc diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 00000000..0c9a3271 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,54 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) +import mathics_scanner + +# -- Project information ----------------------------------------------------- + +project = 'mathics-scanner' +copyright = '2021, The Mathics Team' +author = 'The Mathics Team' + +# The full version, including alpha/beta/rc tags +release = '1.0.1' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ["sphinx.ext.autodoc"] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/implementation.rst b/docs/source/implementation.rst similarity index 74% rename from implementation.rst rename to docs/source/implementation.rst index 559a145a..0e0f664a 100644 --- a/implementation.rst +++ b/docs/source/implementation.rst @@ -1,9 +1,54 @@ -mathics_scanner.characters -========================== +============== +Implementation +============== -This module consists mostly of translation tables between Wolfram's internal -representation and Unicode/ASCII. 
For maintainability, it was decided to store -this data in a human-readable YAML table (in ``data/named-characters.yml``). +The Tokeniser +============= + +Tokenization is performed by the ``Tokeniser`` class. The most important +method in this class is by far the ``next`` method. This method consumes +characters from the feeder and returns a token (if the tokenization succeeds). + +Tokenization Rules +------------------ + +Tokenization rules are defined by declaring methods (in the ``Tokeniser`` +class) whose names are preceded by ``t_``, such as in the following example: :: + + def t_SomeRule(self, match): + # Some logic goes here... + pass + +A tokenization rule is supposed to take a regular expression match (the +``match`` parameter of type ``re.Match``) and convert it to an appropriate +token, which is then returned by the method. The rule is also responsible for +updating the internal state of the tokeniser, such as incrementing the ``pos`` +counter. + +A rule is always expected to receive sane input. In other words, deciding which +rule to call is a responsibility of the caller. Rules are also +automatically called from inside of ``next``. + +Messaging Functionality +----------------------- + +Warnings and errors encountered during scanning and tokenization are collected +in a message queue and stored in the feeders using the ``message`` and +``syntax_message`` methods of ``LineFeeder``. The message queue is therefore a +property of the feeder. The ``Tokeniser`` class also has a method to append +messages to the message queue of its feeder, the ``sntx_message`` method. + +The messages are stored using Mathics' internal format, but this is going to be +revised in the next release (in fact, we plan to replace messages by errors +entirely). + +Character Conversions +===================== + +The ``mathics_scanner.characters`` module consists mostly of translation tables +between Wolfram's internal representation and Unicode/ASCII.
For +maintainability, it was decided to store this data in a human-readable YAML +table (in ``data/named-characters.yml``). The YAML table mainly contains information about how to convert a named character to Unicode and back. If a given character has a direct Unicode diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 00000000..cff35aea --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,22 @@ +Welcome to mathics-scanner's documentation! +=========================================== + +This is the tokeniser or scanner portion for the Wolfram Language. + +As such, it also contains a full set of translation between Wolfram Language +named characters, their Unicode/ASCII equivalents and code-points. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + usage + api + implementation + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/source/usage.rst b/docs/source/usage.rst new file mode 100644 index 00000000..1211e00a --- /dev/null +++ b/docs/source/usage.rst @@ -0,0 +1,18 @@ +===================== +Using mathics-scanner +===================== + +This is used as the scanner inside `Mathics <https://mathics.org>`_ but it can +also be used for tokenizing and formatting Wolfram Language code. In fact we +intend to write one. This library is also quite useful if you need to work +with Wolfram Language named characters and convert them to various formats. + +- For tokenizing and scanning Wolfram Language code, use the + ``mathics_scanner.Tokeniser`` class. +- To convert between Wolfram Language named characters and Unicode/ASCII, use + the ``mathics_scanner.characters.replace_wl_with_plain_text`` and + ``mathics_scanner.characters.replace_unicode_with_wl`` functions. +- To convert between qualified names of named characters (such as ``FormalA`` for + ``\[FormalA]``) and Wolfram's internal representation use the + ``mathics_scanner.characters.named_characters`` dictionary.
+ diff --git a/mathics_scanner/__init__.py b/mathics_scanner/__init__.py index 8052db13..28f711eb 100644 --- a/mathics_scanner/__init__.py +++ b/mathics_scanner/__init__.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- """ -Wolfram-language scanner +This is the tokeniser or scanner portion for the Wolfram Language. + +As such, it also contains a full set of translation between Wolfram Language +named characters, their Unicode/ASCII equivalents and code-points. """ from mathics_scanner.version import __version__ @@ -11,7 +14,8 @@ replace_unicode_with_wl, replace_wl_with_plain_text, ) -from mathics_scanner.tokeniser import is_symbol_name, Tokeniser +# TODO: Move is_symbol_name to the characters module +from mathics_scanner.tokeniser import is_symbol_name, Tokeniser, Token from mathics_scanner.errors import ( InvalidSyntaxError, IncompleteSyntaxError, diff --git a/mathics_scanner/characters.py b/mathics_scanner/characters.py index 7fc6149b..932a0fd5 100644 --- a/mathics_scanner/characters.py +++ b/mathics_scanner/characters.py @@ -1,4 +1,10 @@ # -*- coding: utf-8 -*- +""" +The ``mathics_scanner.characters`` module consists mostly of translation tables +between Wolfram's internal representation of `named characters +`_ +and Unicode/ASCII. +""" import re import ujson @@ -49,8 +55,8 @@ def replace_wl_with_plain_text(wl_input: str, use_unicode=True) -> str: Language named characters. This functions replaces all occurrences of such characters with their corresponding Unicode/ASCII equivalents. - @param: wl_input The string whose characters will be replaced. - @param: use_unicode A flag that indicates whether to use Unicode or ASCII + :param wl_input: The string whose characters will be replaced. + :param use_unicode: A flag that indicates whether to use Unicode or ASCII for the conversion. 
Note that the occurrences of named characters in ``wl_input`` are expect to @@ -72,7 +78,7 @@ def replace_unicode_with_wl(unicode_input: str) -> str: corresponding Unicode equivalents of such characters with the characters themselves. - @param: unicode_input The string whose characters will be replaced. + :param unicode_input: The string whose characters will be replaced. Note that the occurrences of named characters in the output of ``replace_unicode_with_wl`` are represented using Wolfram's internal diff --git a/mathics_scanner/feed.py b/mathics_scanner/feed.py index dbaa1023..06940969 100644 --- a/mathics_scanner/feed.py +++ b/mathics_scanner/feed.py @@ -78,8 +78,8 @@ class MultiLineFeeder(LineFeeder): def __init__(self, lines, filename=""): """ - @param: lines The source of the feeder (a string). - @param: filename A string that describes the source of the feeder, i.e. + :param lines: The source of the feeder (a string). + :param filename: A string that describes the source of the feeder, i.e. the filename that is being feed. """ super(MultiLineFeeder, self).__init__(filename) @@ -106,8 +106,8 @@ class SingleLineFeeder(LineFeeder): def __init__(self, code, filename=""): """ - @param: code The source of the feeder (a string). - @param: filename A string that describes the source of the feeder, i.e. + :param code: The source of the feeder (a string). + :param filename: A string that describes the source of the feeder, i.e. the filename that is being feed. """ super().__init__(filename) @@ -130,8 +130,8 @@ class FileLineFeeder(LineFeeder): def __init__(self, fileobject, trace_fn=None): """ - @param: fileobject The source of the feeder (a string). - @param: filename A string that describes the source of the feeder, + :param fileobject: The source of the feeder (a file object). + :param filename: A string that describes the source of the feeder, i.e. the filename that is being feed.
""" super().__init__(fileobject.name) diff --git a/mathics_scanner/generate/rl_inputrc.py b/mathics_scanner/generate/rl_inputrc.py index 4c8517b2..f3ea359a 100755 --- a/mathics_scanner/generate/rl_inputrc.py +++ b/mathics_scanner/generate/rl_inputrc.py @@ -1,7 +1,10 @@ #!/bin python3 """ -Creates GNU Readline inputrc tables for converting WL escape sequences to either -unicode symbols or WL Character strings +Creates GNU Readline inputrc tables for converting Wolfram Language escape +sequences to either unicode symbols or Wolfram Language fully qualified named +characters. See `Named Characters +`_ +for more information on character aliases. """ import sys @@ -21,8 +24,9 @@ def _format(c: str, use_unicode: bool) -> str: def generate_inputrc(fd=sys.stdout, use_unicode=True) -> None: """ - Generates inputrc files that maps WL ESC sequence aliases to their - corresponding plain-text representation (full Unicode or strick ASCII) + Generates inputrc files that map Wolfram Language ESC sequence aliases to + their corresponding plain-text representation (full Unicode or strict + ASCII) """ for alias in aliased_characters: fd.write(_format(alias, use_unicode)) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 4e9695bc..df36bc33 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -313,12 +313,12 @@ def is_symbol_name(text): class Token(object): - "A representation of a Wolfram Language token" + "A representation of a Wolfram Language token." def __init__(self, tag, text, pos): """ - @param: tag A string that indicates which type of token this is. - @param: text The actual contents of the token. - @param: pos The position of the token in the input feed. + :param tag: A string that indicates which type of token this is. + :param text: The actual contents of the token. + :param pos: The position of the token in the input feed.
""" self.tag = tag self.text = text @@ -338,19 +338,6 @@ def __repr__(self): class Tokeniser(object): """ A tokeniser for the Wolfram Language. - - When subclassing ``Tokeniser``, custom tokenisation rules can be defined by - declaring methods whose names are preceded by ``t_``, such as in the - following example: :: - - class MyTokeniser(Tokeniser): - def t_MyWeirdRule(self, match): - # Your logic goes here... - pass - - In this example, ``t_MyWeirdRule`` is supposed to update the internal state - of the tokeniser and return a ``Token`` with an appropriate tag. ``m̀atch`` - is expected to be an instance of ``re.Match``. """ modes = { "expr": (tokens, token_indices), @@ -359,7 +346,7 @@ def t_MyWeirdRule(self, match): def __init__(self, feeder): """ - @param: feeder An instance of ``LineFeeder`` which will feed characters + :param feeder: An instance of ``LineFeeder`` which will feed characters to the tokeniser. """ self.pos = 0 @@ -370,14 +357,14 @@ def __init__(self, feeder): def _change_mode(self, mode): """ - Set the mode of the tokeniser + Set the mode of the tokeniser. """ self.mode = mode self.tokens, self.token_indices = self.modes[mode] # TODO: Rename this to something that remotetly makes sense? def incomplete(self): - "Get more code from the prescanner and continue" + "Get more code from the prescanner and continue." self.prescanner.incomplete() self.code += self.prescanner.scan() @@ -393,7 +380,7 @@ def sntx_message(self, pos=None): # TODO: Convert this to __next__ in the future? def next(self): - "Returns the next token" + "Returns the next token." self._skip_blank() if self.pos >= len(self.code): return Token("END", "", len(self.code))