How to encode English plain-text (consisting only of letters a-z and whitespace) using a 5-bit character encoding in Python? -
In Python, is there a way to encode English plain text (consisting only of the lowercase letters a-z and whitespace, i.e. a total of 27 characters) using a 5-bit character encoding? If yes, please tell me how.
To be more specific, I have a string: s="hello world". After encoding it using a 5-bit character encoding in Python, I want to save the string to an external file such that each character in the file takes 5 bits of storage space.
First, you'll need to convert the characters from ASCII to a 5-bit encoding. It's up to you how to do it. One possible straightforward way:
class toomuchbits(Exception):
    """Raised when a value cannot be represented in the 5-bit encoding."""


def encode_str(data):
    """Map a string of a-z and spaces onto 5-bit codes.

    Letters 'a'..'z' become 1..26 and the space becomes 27. Zero is never
    produced, so it stays available as an "unused bits" marker for a
    later packing step.

    :param data: text consisting only of lowercase a-z and ' '.
    :returns: bytearray with one code (1..27) per input character.
    :raises toomuchbits: if a character is outside the 27-symbol alphabet.
    """
    buf = bytearray()
    for char in data:
        num = ord(char)
        # Lower case latin letters
        if 97 <= num <= 122:
            buf.append(num - 96)
        # Space
        elif num == 32:
            buf.append(27)
        else:
            raise toomuchbits(char)
    return buf


def decode_str(data):
    """Reverse :func:`encode_str`.

    :param data: iterable of byte values (bytes/bytearray) holding codes
        1..26 for letters and 27 for the space.
    :returns: the decoded ASCII text as ``bytes`` (``str`` on Python 2).
    """
    buf = bytearray()
    # bytearray() yields integers on both Python 2 and Python 3
    for num in bytearray(data):
        if num == 27:
            buf.append(32)  # ord(' ')
        else:
            # Append the integer code point: bytearray.append() only
            # accepts one-character strings on Python 2.
            buf.append(num + 96)
    return bytes(buf)
After you have the 5-bit numbers, they can be packed into 8-bit bytes. Like this:
# Number of bits used per packed value; should not be more than 8
bits = 5


def get_last_bits(value, count):
    """Return the lowest ``count`` bits of ``value``."""
    return value & ((1 << count) - 1)


def pack(data):
    """Pack a sequence of ``bits``-wide integers into 8-bit bytes.

    Values are written least-significant-bit first; a value that does not
    fit into the remaining bits of the current byte is split across two
    bytes. Unused bits at the end of the last byte stay zero.

    :param data: iterable of integers in the range 1 .. 2**bits - 1.
    :returns: the packed data as ``bytes``.
    :raises toomuchbits: if a value is not an int, is <= 0, or needs more
        than ``bits`` bits.
    """
    buf = bytearray(1)
    used_bits = 0
    for num in data:
        # Zeroes are a special value marking unused bits
        if not isinstance(num, int) or num <= 0 or num.bit_length() > bits:
            raise toomuchbits(num)
        # Character fits into the available bits of the current byte
        if used_bits <= 8 - bits:
            buf[-1] |= num << used_bits
            used_bits += bits
        # Character should be split into two different bytes
        else:
            # Put the lowest bits into the available space
            buf[-1] |= get_last_bits(num, 8 - used_bits) << used_bits
            # Put the highest bits into the next byte
            buf.append(num >> (8 - used_bits))
            used_bits += bits - 8
    return bytes(buf)


def unpack(data):
    """Reverse :func:`pack`: extract ``bits``-wide integers from bytes.

    Decoding stops at the first zero value, because zero marks the unused
    padding bits at the end of the packed data.

    :param data: bytes-like object produced by :func:`pack`.
    :returns: bytearray holding one unpacked value per element.
    """
    buf = bytearray()
    data = bytearray(data)
    # Characters are filled using logical OR, and therefore initialized to 0
    char_value = 0
    char_bits_left = bits
    for byte in data:
        data_bits_left = 8
        while data_bits_left >= char_bits_left:
            # Current character ends in the current byte.
            # Take bits from the current data byte and shift them to the
            # appropriate position.
            char_value |= (
                get_last_bits(byte, char_bits_left) << (bits - char_bits_left)
            )
            # Discard the processed bits
            byte = byte >> char_bits_left
            data_bits_left -= char_bits_left
            # Zero means the end of the string. Without this check the
            # unused space at the end of the data would be decoded as a
            # 0x0 character. Return right away: carrying on past the
            # terminator would drive char_bits_left negative when more
            # zero padding bytes follow.
            if char_value == 0:
                return buf
            # Store and re-initialize the character
            buf.append(char_value)
            char_value = 0
            char_bits_left = bits
        # Collect the bits left in the current byte
        if data_bits_left:
            char_value |= byte
            char_bits_left -= data_bits_left
    return buf
This seems to work as expected:
# Round-trip demo: encode -> pack -> unpack -> decode, printing each stage.
# The 43-character sentence matches the lengths shown in the sample output.
test_string = "the quick brown fox jumps over the lazy dog"
encoded = encode_str(test_string)
packed = pack(encoded)
unpacked = unpack(packed)
decoded = decode_str(unpacked)

# Single-argument print(...) behaves the same on Python 2 and Python 3
print("test str (len: %d): %r" % (len(test_string), test_string))
print("encoded (len: %d): %r" % (len(encoded), encoded))
print("packed (len: %d): %r" % (len(packed), packed))
print("unpacked (len: %d): %r" % (len(unpacked), unpacked))
print("decoded (len: %d): %r" % (len(decoded), decoded))
outputs:
test str (len: 43): 'the quick brown fox jumps over the lazy dog'
encoded (len: 43): bytearray(b'\x14\x08\x05\x1b\x11\x15\t\x03\x0b\x1b\x02\x12\x0f\x17\x0e\x1b\x06\x0f\x18\x1b\n\x15\r\x10\x13\x1b\x0f\x16\x05\x12\x1b\x14\x08\x05\x1b\x0c\x01\x1a\x19\x1b\x04\x0f\x07')
packed (len: 27): '\x14\x95\x1dk\x1ak\x0b\xf9\xae\xdb\xe6\xe1\xadj\x83s?[\xe4\xa6\xa8l\x16t\xde\xe4\x1d'
unpacked (len: 43): bytearray(b'\x14\x08\x05\x1b\x11\x15\t\x03\x0b\x1b\x02\x12\x0f\x17\x0e\x1b\x06\x0f\x18\x1b\n\x15\r\x10\x13\x1b\x0f\x16\x05\x12\x1b\x14\x08\x05\x1b\x0c\x01\x1a\x19\x1b\x04\x0f\x07')
decoded (len: 43): 'the quick brown fox jumps over the lazy dog'
Comments
Post a Comment