How to encode English plain-text (consisting only of letters a-z and whitespace) using a 5-bit character encoding in Python? -


In Python, is there a way to encode English plain text (consisting of the lowercase letters a-z and whitespace, i.e. a total of 27 characters) using a 5-bit character encoding? If yes, please tell me how.

To be more specific, I have a string: s="hello world". After encoding it using a 5-bit character encoding in Python, I want to save the string to an external file such that each character in the file takes 5 bits of storage space.

First, you'll need to convert the characters from ASCII to a 5-bit encoding. It's up to you how to do it. One possible straightforward way:

class toomuchbits(Exception):
    """Raised when a value cannot be represented in the 5-bit alphabet."""


def encode_str(data):
    """Translate text into 5-bit codes, one code per output byte.

    Lowercase letters ``a``-``z`` become 1-26 and a space becomes 27.
    Code 0 is never produced.

    Args:
        data: an iterable of single characters (for example a ``str``).

    Returns:
        A ``bytearray`` holding one code (1-27) per input character.

    Raises:
        toomuchbits: if a character is neither ``a``-``z`` nor a space.
    """
    buf = bytearray()
    for char in data:
        num = ord(char)

        # lower case latin letters
        if 97 <= num <= 122:
            buf.append(num - 96)
        # space
        elif num == 32:
            buf.append(27)
        else:
            raise toomuchbits(char)

    return buf


def decode_str(data):
    """Translate 5-bit codes produced by ``encode_str`` back into text.

    Code 27 becomes a space; every other code ``n`` becomes
    ``chr(n + 96)`` without range checking.

    Args:
        data: a ``bytearray``/``bytes`` (or iterable of ints 0-255) of codes.

    Returns:
        The decoded text as a native ``str``.
    """
    # bytearray() makes iteration yield ints on both Python 2 and Python 3;
    # building a str avoids appending text to a bytearray, which Python 3
    # rejects with TypeError.
    return "".join(
        " " if num == 27 else chr(num + 96) for num in bytearray(data)
    )

After that, you have 5-bit numbers that can be packed into 8-bit bytes. Like this:

# Width of one packed character in bits; should not be more than 8.
bits = 5


def get_last_bits(value, count):
    """Return the ``count`` least-significant bits of ``value``."""
    return value & ((1 << count) - 1)


def pack(data):
    """Pack ``bits``-wide codes tightly into 8-bit bytes, low bits first.

    Args:
        data: an iterable of positive ints, each fitting in ``bits`` bits.

    Returns:
        The packed data as ``bytes``. Unused bits in the final byte are
        left as zero; an empty input yields a single zero byte.

    Raises:
        toomuchbits: if an item is not an int, is not positive, or needs
            more than ``bits`` bits.
    """
    buf = bytearray(1)
    used_bits = 0  # number of bits already occupied in buf[-1]

    for num in data:
        # zeroes are a special value marking unused bits
        if not isinstance(num, int) or num <= 0 or num.bit_length() > bits:
            raise toomuchbits(num)

        # character fits into the available bits of the current byte
        if used_bits <= 8 - bits:
            buf[-1] |= num << used_bits
            used_bits += bits
        # character has to be split across two different bytes
        else:
            free_bits = 8 - used_bits
            # put the lowest bits into the available space
            buf[-1] |= get_last_bits(num, free_bits) << used_bits
            # put the highest bits into the next byte
            buf.append(num >> free_bits)
            used_bits += bits - 8

    return bytes(buf)


def unpack(data):
    """Unpack bytes produced by ``pack`` back into ``bits``-wide codes.

    Decoding stops at the first all-zero code, which marks unused padding.

    Args:
        data: packed ``bytes``/``bytearray`` (or anything ``bytearray()``
            accepts).

    Returns:
        A ``bytearray`` with one code per element.
    """
    buf = bytearray()

    # characters are filled using OR logic and are therefore initialized to 0
    char_value = 0
    char_bits_left = bits

    for byte in bytearray(data):
        data_bits_left = 8

        while data_bits_left >= char_bits_left:
            # the current character ends in the current byte: take its bits
            # from the byte and shift them into the appropriate position
            char_value |= (
                get_last_bits(byte, char_bits_left) << (bits - char_bits_left)
            )

            # discard the processed bits
            byte >>= char_bits_left
            data_bits_left -= char_bits_left

            # 0 means end of string. Return right away instead of only
            # leaving the inner loop: carrying on with char_bits_left not
            # reset would make it negative on following zero bytes and
            # raise "negative shift count".
            if char_value == 0:
                return buf

            # store the character and start a new one
            buf.append(char_value)
            char_value = 0
            char_bits_left = bits

        # collect the bits left in the current byte
        if data_bits_left:
            char_value |= byte
            char_bits_left -= data_bits_left

    return buf

This seems to work as expected:

# Round-trip demo: text -> 5-bit codes -> packed bytes -> codes -> text.
# The full 43-character pangram is used so the lengths printed below match
# the recorded output (43 characters packed into 27 bytes).
test_string = "the quick brown fox jumps over the lazy dog"

encoded = encode_str(test_string)
packed = pack(encoded)
unpacked = unpack(packed)
decoded = decode_str(unpacked)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("test str (len: %d): %r" % (len(test_string), test_string))
print("encoded (len: %d):  %r" % (len(encoded), encoded))
print("packed (len: %d):   %r" % (len(packed), packed))
print("unpacked (len: %d): %r" % (len(unpacked), unpacked))
print("decoded (len: %d):  %r" % (len(decoded), decoded))

outputs:

test str (len: 43): 'the quick brown fox jumps over the lazy dog'
encoded (len: 43):  bytearray(b'\x14\x08\x05\x1b\x11\x15\t\x03\x0b\x1b\x02\x12\x0f\x17\x0e\x1b\x06\x0f\x18\x1b\n\x15\r\x10\x13\x1b\x0f\x16\x05\x12\x1b\x14\x08\x05\x1b\x0c\x01\x1a\x19\x1b\x04\x0f\x07')
packed (len: 27):   '\x14\x95\x1dk\x1ak\x0b\xf9\xae\xdb\xe6\xe1\xadj\x83s?[\xe4\xa6\xa8l\x16t\xde\xe4\x1d'
unpacked (len: 43): bytearray(b'\x14\x08\x05\x1b\x11\x15\t\x03\x0b\x1b\x02\x12\x0f\x17\x0e\x1b\x06\x0f\x18\x1b\n\x15\r\x10\x13\x1b\x0f\x16\x05\x12\x1b\x14\x08\x05\x1b\x0c\x01\x1a\x19\x1b\x04\x0f\x07')
decoded (len: 43):  'the quick brown fox jumps over the lazy dog'

Comments