Get proquint documented

This commit is contained in:
Reid 'arrdem' McKenzie 2021-08-03 19:04:23 -06:00
parent 0bda1dd2f3
commit abc4b8dddb
7 changed files with 330 additions and 151 deletions

16
projects/proquint/BUILD Normal file
View file

@ -0,0 +1,16 @@
# Library target for the proquint package: every Python file under src/python.
py_library(
name = "lib",
srcs = glob(["src/python/**/*.py"]),
# Listing src/python under `imports` puts that directory on the Python import path
# of dependents (see the Bazel py_library `imports` attribute).
imports = [
"src/python",
],
)
# Test target running every Python file under test/python against ":lib".
# NOTE(review): `py_pytest` and `py_requirement` are not loaded in this file; presumably they
# are provided by the repository's build prelude - confirm.
py_pytest(
name = "test",
srcs = glob(["test/python/**/*.py"]),
deps = [
":lib",
# The round-trip tests are written with hypothesis.
py_requirement("hypothesis"),
],
)

View file

@ -1,3 +1,53 @@
# Proquint
An alternative implementation to https://github.com/dsw/proquint/tree/master/python, which is kinda garbo.
An implementation of [A Proposal for Proquints](https://arxiv.org/html/0901.4016).
To summarize the paper, traditional decimal and hexadecimal encodings are inconvenient for identifiers with a "large" bit-width.
They offer no obvious, compact way to be pronounced and are traditionally written without any punctuation to break them into segments.
The proquint format instead encodes each 16-bit chunk as a dense, five-letter word that fits within the pronounceable space of English.
## Demo
``` python
>>> from proquint import Proquint
>>> Proquint.encode_i16(0)
'babab'
>>> Proquint.encode_i16(1)
'babad'
>>> Proquint.encode_i64(14708250061244963317)
'subiv-gavab-sobiz-noluj'
>>> Proquint.decode('babad')
1
```
## API Overview
### `proquint.Proquint.CONSONANTS`
A string of consonants to use when encoding or decoding proquints.
Must be of length 16.
### `proquint.Proquint.VOWELS`
A string of vowels to use when encoding or decoding proquints.
Must be of length 4.
### `proquint.Proquint.decode(buffer: str) -> int`
Decode a proquint string to an integer value without restriction on bit-width.
### `proquint.Proquint.encode(val: int, width: int) -> str`
Encode an integer into a string which will decode to the same value.
Note that the bit-width must be specified in order to determine the number of required segments.
### `proquint.Proquint.encode_{i16, i32, i64}(val: int) -> str`
Helpers for encoding known-width quantities.
## LICENSE
Copyright Reid 'arrdem' McKenzie August 2021.
Published under the terms of the MIT license.

View file

@ -1,105 +0,0 @@
"""Proquint - pronounceable codings of integers.
Implemented from http://arxiv.org/html/0901.4016
"""
from functools import cache
class Proquint(object):
    """Pronounceable codings of integers, after http://arxiv.org/html/0901.4016

    Each 16 bit word becomes a five letter "proquint" laid out, most significant bits first, as
    consonant (4 bits), vowel (2), consonant (4), vowel (2), consonant (4). Multiple proquints
    are joined with dashes.
    """

    # Class parameters
    ################################################################################################
    # A letter's index within its alphabet is the value it encodes.
    CONSONANTS = "bdfghjklmnprstvz"
    VOWELS = "aiou"
    # Default byte order used by `encode` when turning an integer into bytes.
    BYTEORDER = "big"

    # Implementation helpers
    ################################################################################################
    @classmethod
    @cache
    def _consonant_to_uint(cls, c: str) -> int:
        """Return the 4 bit value encoded by the consonant `c`.

        Raises:
            KeyError: if `c` is not one of `CONSONANTS`.
        """
        # str.find reports a miss as -1; str.index would raise ValueError instead.
        idx = cls.CONSONANTS.find(c)
        if idx == -1:
            raise KeyError(c)
        return idx

    @classmethod
    @cache
    def _vowel_to_uint(cls, c: str) -> int:
        """Return the 2 bit value encoded by the vowel `c`.

        Raises:
            KeyError: if `c` is not one of `VOWELS`.
        """
        idx = cls.VOWELS.find(c)
        if idx == -1:
            raise KeyError(c)
        return idx

    @classmethod
    def _encode(cls, buffer: bytes):
        """Yield one five letter proquint per 16 bit (two byte) word of `buffer`.

        Raises:
            ValueError: on first iteration, if `buffer` holds an odd number of bytes.
        """
        if len(buffer) % 2:
            # zip() below would otherwise silently drop the trailing byte.
            raise ValueError("Proquints encode 16 bit words; got an odd number of bytes")

        for hi, lo in zip(buffer[0::2], buffer[1::2]):
            # Two 8 bit bytes form one 16 bit word, first byte most significant.
            word = hi << 8 | lo
            # Fields are emitted from the most significant bits down.
            c1 = (word >> 12) & 0x0F
            v1 = (word >> 10) & 0x03
            c2 = (word >> 6) & 0x0F
            v2 = (word >> 4) & 0x03
            c3 = word & 0x0F
            yield (
                f"{cls.CONSONANTS[c1]}{cls.VOWELS[v1]}{cls.CONSONANTS[c2]}"
                f"{cls.VOWELS[v2]}{cls.CONSONANTS[c3]}"
            )

    # Core methods
    ################################################################################################
    @classmethod
    def encode_bytes(cls, buffer: bytes) -> str:
        """Encode a sequence of bytes into a dash separated proquint string.

        >>> Proquint.encode_bytes(bytes([127, 0, 0, 1]))
        'lusab-babad'

        Raises:
            ValueError: if `buffer` holds an odd number of bytes.
        """
        return "-".join(cls._encode(buffer))

    @classmethod
    def decode(cls, buffer: str) -> int:
        """Convert a proquint string into the integer it encodes.

        Dashes are skipped; each consonant contributes 4 bits and each vowel 2 bits, most
        significant first. The consonant/vowel ordering itself is not checked.

        >>> hex(Proquint.decode('lusab-babad'))
        '0x7f000001'

        Raises:
            ValueError: if `buffer` contains a character that is neither a dash, a consonant
                nor a vowel.
        """
        res = 0
        for c in buffer:
            if c == "-":
                continue
            if c in cls.CONSONANTS:
                res = res << 4 | cls._consonant_to_uint(c)
            elif c in cls.VOWELS:
                res = res << 2 | cls._vowel_to_uint(c)
            else:
                raise ValueError(f"Bad proquint format: unexpected character {c!r}")
        return res

    # Handy aliases
    ################################################################################################
    @classmethod
    def encode(cls, val: int, width: int, byteorder=BYTEORDER):
        """Encode the integer `val` as a proquint string of `width` bits.

        `decode` reads proquints most significant first, so only the default "big" byte order
        round-trips through it.

        Raises:
            ValueError: if `width` is not a positive multiple of 16.
            OverflowError: if `val` is negative or does not fit in `width` bits.
        """
        # Each proquint carries exactly 16 bits, so partial words cannot be represented.
        if width < 16 or width % 16 != 0:
            raise ValueError("Width must be a positive multiple of 16 bits")
        return cls.encode_bytes(val.to_bytes(width // 8, byteorder))

    @classmethod
    def encode_i16(cls, val: int):
        """Encode a 16 bit int to a proquint string."""
        return cls.encode(val, 16)

    @classmethod
    def encode_i32(cls, val: int):
        """Encode a 32 bit int to a proquint string."""
        return cls.encode(val, 32)

    @classmethod
    def encode_i64(cls, val: int):
        """Encode a 64 bit int into a proquint string."""
        return cls.encode(val, 64)

View file

@ -1,33 +1,18 @@
"""A setuptools based setup module.
"""
# io.open is needed for projects that support Python 2.7
# It ensures open() defaults to text mode with universal newlines,
# and accepts an argument to specify the text encoding
# Python 3 only projects can skip this import
from io import open
from os import path
# Always prefer setuptools over distutils
from setuptools import find_packages, setup
here = path.abspath(path.dirname(__file__))
# Get the long description from the README file
with open(path.join(here, "README.md"), encoding="utf-8") as f:
long_description = f.read()
# Arguments marked as "Required" below must be included for upload to PyPI.
# Fields marked as "Optional" may be commented out.
setup(
name="proquint", # Required
version="0.1.0", # Required
description="Enunciable numerics",
long_description=long_description, # Optional
long_description_content_type="text/markdown", # Optional (see note above)
name="arrdem.proquint",
version="0.1.0",
description="Enunciable numeric identifiers",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/arrdem/source",
author="Reid 'arrdem' McKenzie",
author_email="me@arrdem.com",
@ -37,32 +22,13 @@ setup(
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.9",
],
# This field adds keywords for your project which will appear on the
# project page. What does your project relate to?
#
# Note that this is a string of words separated by whitespace, not a list.
keywords="sample setuptools development", # Optional
# You can just specify package directories manually here if your project is
# simple. Or you can use find_packages().
#
# Alternatively, if you just want to distribute a single Python file, use
# the `py_modules` argument instead as follows, which will expect a file
# called `my_module.py` to exist:
#
# py_modules=["my_module"],
#
packages=find_packages(exclude=["docs", "tests"]),
python_requires=">=3.5",
# List additional groups of dependencies here (e.g. development
# dependencies). Users will be able to install these using the "extras"
# syntax, for example:
#
# $ pip install sampleproject[dev]
#
# Similar to `install_requires` above, these must be valid existing
# projects.
packages=[
"proquint",
],
package_dir={"": "src/python"},
python_requires=">=3.9",
extras_require={ # Optional
"dev": ["check-manifest"],
"test": ["pytest", "hypothesis"],

View file

@ -0,0 +1,164 @@
"""Proquint - pronounceable codings of integers.
Implemented from http://arxiv.org/html/0901.4016
Quoting -
we propose encoding a 16-bit string as a proquint of alternating consonants and vowels as follows.
Four-bits as a consonant:
0 1 2 3 4 5 6 7 8 9 A B C D E F
b d f g h j k l m n p r s t v z
Two-bits as a vowel:
0 1 2 3
a i o u
Whole 16-bit word, where "con" = consonant, "vo" = vowel:
 0 1 2 3 4 5 6 7 8 9 A B C D E F
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|con    |vo |con    |vo |con    |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Separate proquints using dashes, which can go un-pronounced or be pronounced "eh". The suggested optional magic number prefix to a sequence of proquints is "0q-".
Here are some IP dotted-quads and their corresponding proquints.
127.0.0.1 lusab-babad
63.84.220.193 gutih-tugad
63.118.7.35 gutuk-bisog
140.98.193.141 mudof-sakat
64.255.6.200 haguz-biram
128.30.52.45 mabiv-gibot
147.67.119.2 natag-lisaf
212.58.253.68 tibup-zujah
216.35.68.215 tobog-higil
216.68.232.21 todah-vobij
198.81.129.136 sinid-makam
12.110.110.204 budov-kuras
"""
from functools import cache
from typing import Iterator, Optional
class Proquint(object):
    """Encode integers as proquints and decode them back.

    A proquint is a five letter word - consonant, vowel, consonant, vowel, consonant - carrying
    16 bits (4 + 2 + 4 + 2 + 4), most significant bits first. Wider values are written as a
    sequence of proquints separated by dashes.
    """

    # Class parameters
    ################################################################################################
    # A letter's index within its alphabet is the value it encodes.
    CONSONANTS = "bdfghjklmnprstvz"
    VOWELS = "aiou"

    # Implementation helpers
    ################################################################################################
    @classmethod
    @cache
    def _consonant_to_uint(cls, c: str) -> Optional[int]:
        """Return the 4 bit value of the consonant `c`, or None if `c` is not a consonant."""
        try:
            return cls.CONSONANTS.index(c)
        except ValueError:
            return None

    @classmethod
    @cache
    def _vowel_to_uint(cls, c: str) -> Optional[int]:
        """Return the 2 bit value of the vowel `c`, or None if `c` is not a vowel."""
        try:
            return cls.VOWELS.index(c)
        except ValueError:
            return None

    @classmethod
    def _encode(cls, buffer: bytes) -> Iterator[str]:
        """Yield one five letter proquint per 16 bit (two byte) word of `buffer`.

        Raises:
            ValueError: on first iteration, if `buffer` holds an odd number of bytes.
        """
        # This is a bit tricky.
        # Proquints are encoded not out of 8 bit quantities but out of 16 bit quantities.
        #
        # Example from the proposal:
        #
        #  0 1 2 3 4 5 6 7 8 9 A B C D E F
        # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
        # |con    |vo |con    |vo |con    |
        # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
        #
        # The diagram numbers bits left-to-right starting from the most significant one, so the
        # first letter comes from the highest order bits. Expressed as right shifts the five
        # fields therefore start at 0xC, 0xA, 0x6, 0x4 and 0x0.
        if len(buffer) % 2:
            # zip() below would otherwise silently drop the trailing byte.
            raise ValueError("Proquints encode 16 bit words; got an odd number of bytes")

        # (alphabet, shift, mask) for each letter, in output order.
        fields = (
            (cls.CONSONANTS, 0xC, 0xF),
            (cls.VOWELS, 0xA, 0x3),
            (cls.CONSONANTS, 0x6, 0xF),
            (cls.VOWELS, 0x4, 0x3),
            (cls.CONSONANTS, 0x0, 0xF),
        )

        for hi, lo in zip(buffer[0::2], buffer[1::2]):
            # Rebuild the pair of 8 bit bytes into one 16 bit word.
            word = hi << 8 | lo
            yield "".join(alphabet[word >> shift & mask] for alphabet, shift, mask in fields)

    # Core methods
    ################################################################################################
    @classmethod
    def encode_bytes(cls, buffer: bytes) -> str:
        """Encode a sequence of bytes into a dash separated proquint string.

        >>> Proquint.encode_bytes(bytes([127, 0, 0, 1]))
        'lusab-babad'

        Raises:
            ValueError: if `buffer` holds an odd number of bytes.
        """
        return "-".join(cls._encode(buffer))

    @classmethod
    def decode(cls, buffer: str) -> int:
        """Convert a proquint string into the integer it encodes, whatever its bit-width.

        Dashes are skipped; each consonant contributes 4 bits and each vowel 2 bits, most
        significant first. The consonant/vowel ordering itself is not checked.

        >>> hex(Proquint.decode('lusab-babad'))
        '0x7f000001'

        Raises:
            ValueError: if `buffer` contains a character that is neither a dash, a consonant
                nor a vowel.
        """
        res = 0
        for c in buffer:
            if c == "-":
                continue
            if (mag := cls._consonant_to_uint(c)) is not None:
                res = res << 4 | mag
            elif (mag := cls._vowel_to_uint(c)) is not None:
                res = res << 2 | mag
            else:
                # Every stray character is an error, wherever it appears in the string.
                raise ValueError(f"Bad proquint format: unexpected character {c!r}")
        return res

    # Handy aliases
    ################################################################################################
    @classmethod
    def encode(cls, val: int, width: int):
        """Encode the integer `val` as a proquint string of `width` bits.

        The width fixes the number of proquints produced (`width // 16`), so leading zero words
        are kept.

        Raises:
            ValueError: if `width` is not a positive multiple of 16.
            OverflowError: if `val` is negative or does not fit in `width` bits.
        """
        # Each proquint carries exactly 16 bits, so partial words cannot be represented.
        if width < 16 or width % 16 != 0:
            raise ValueError("Width must be a positive multiple of 16 bits")
        return cls.encode_bytes(val.to_bytes(width // 8, "big"))

    @classmethod
    def encode_i16(cls, val: int):
        """Encode a 16 bit int to a proquint string."""
        return cls.encode(val, 16)

    @classmethod
    def encode_i32(cls, val: int):
        """Encode a 32 bit int to a proquint string."""
        return cls.encode(val, 32)

    @classmethod
    def encode_i64(cls, val: int):
        """Encode a 64 bit int into a proquint string."""
        return cls.encode(val, 64)

View file

@ -0,0 +1,57 @@
"""Tests based off of known examples."""
import proquint
import pytest
# Known-answer table shared by the parametrized tests in this module.
# Each entry is (integer value, bit width passed to encode, expected proquint string).
examples = [
# Various single-bit data
(1, 32, "babab-babad"),
(2, 32, "babab-babaf"),
(4, 32, "babab-babah"),
(8, 32, "babab-babam"),
(16, 32, "babab-babib"),
(32, 32, "babab-babob"),
(64, 32, "babab-badab"),
(128, 32, "babab-bafab"),
(256, 32, "babab-bahab"),
(512, 32, "babab-bamab"),
(1024, 32, "babab-bibab"),
(2048, 32, "babab-bobab"),
(4096, 32, "babab-dabab"),
(8192, 32, "babab-fabab"),
(16384, 32, "babab-habab"),
(32768, 32, "babab-mabab"),
(65536, 32, "babad-babab"),
(131072, 32, "babaf-babab"),
(262144, 32, "babah-babab"),
(524288, 32, "babam-babab"),
(1048576, 32, "babib-babab"),
(2097152, 32, "babob-babab"),
(4194304, 32, "badab-babab"),
(8388608, 32, "bafab-babab"),
(16777216, 32, "bahab-babab"),
(33554432, 32, "bamab-babab"),
(67108864, 32, "bibab-babab"),
(134217728, 32, "bobab-babab"),
(268435456, 32, "dabab-babab"),
(536870912, 32, "fabab-babab"),
(1073741824, 32, "habab-babab"),
(2147483648, 32, "mabab-babab"),
# A random value (3232235536 == 0xC0A80010)
(3232235536, 32, "safom-babib"),
]
@pytest.mark.parametrize("val,width,qint", examples)
def test_decode_examples(val, width, qint):
    """Every known proquint decodes to its tabulated integer value."""
    decoded = proquint.Proquint.decode(qint)
    assert decoded == val, f"qint {qint} did not decode"
@pytest.mark.parametrize("val,width,qint", examples)
def test_encode_examples(val, width, qint):
    """Every tabulated integer encodes to its known proquint at the given width."""
    actual = proquint.Proquint.encode(val, width)
    # Decoded again purely so a failure message shows which value was actually encoded.
    round_tripped = proquint.Proquint.decode(actual)
    assert actual == qint, f"did not encode {val} to {qint}; got {actual} ({round_tripped})"

View file

@ -0,0 +1,31 @@
"""Tests based off of round-tripping randomly generated examples."""
import proquint
import pytest
from hypothesis import given
from hypothesis.strategies import integers
# hypothesis bounds are inclusive: the largest value that fits in 16 bits is (1 << 16) - 1.
# 1 << 16 itself needs 17 bits and makes encode() overflow.
@given(integers(min_value=0, max_value=(1 << 16) - 1))
def test_round_trip_16(val):
    """Any unsigned 16 bit value survives an encode/decode round trip."""
    encoded = proquint.Proquint.encode(val, 16)
    assert proquint.Proquint.decode(encoded) == val
# hypothesis bounds are inclusive: the largest value that fits in 32 bits is (1 << 32) - 1.
# 1 << 32 itself needs 33 bits and makes encode() overflow.
@given(integers(min_value=0, max_value=(1 << 32) - 1))
def test_round_trip_32(val):
    """Any unsigned 32 bit value survives an encode/decode round trip."""
    encoded = proquint.Proquint.encode(val, 32)
    assert proquint.Proquint.decode(encoded) == val
# hypothesis bounds are inclusive: the largest value that fits in 64 bits is (1 << 64) - 1.
# 1 << 64 itself needs 65 bits and makes encode() overflow.
@given(integers(min_value=0, max_value=(1 << 64) - 1))
def test_round_trip_64(val):
    """Any unsigned 64 bit value survives an encode/decode round trip."""
    encoded = proquint.Proquint.encode(val, 64)
    assert proquint.Proquint.decode(encoded) == val
# hypothesis bounds are inclusive: the largest value that fits in 512 bits is (1 << 512) - 1.
# 1 << 512 itself needs 513 bits and makes encode() overflow.
@given(integers(min_value=0, max_value=(1 << 512) - 1))
def test_round_trip_512(val):
    """Any unsigned 512 bit value survives an encode/decode round trip."""
    encoded = proquint.Proquint.encode(val, 512)
    assert proquint.Proquint.decode(encoded) == val