base大家族详解
*/-->
pre.src {background-color: #292b2e; color: #b2b2b2;}
pre.src {background-color: #292b2e; color: #b2b2b2;}
base大家族详解
1 简介
对于base编码的使用,参考Base What? A Practical Introduction to Base Encoding。
- base2: 2进制
- base10: 10进制
- base16: 计算机字节表示
- base32: 字母加数字
- base64: 更大的表示范围
- base58: bitcoin为了比base64更易读采用的表示
这些base编码主要是在计算机的二进制表示,和人可读的字符表示之间转换。
具体实现可以分为2种,能利用bit位直接转换的base(例如base2, base16,base32, base64),和不能利用bit位直接转换的base(例如base10, base58)。
2 实现代码
全部采用clojure实现,主要是为了理解base编码,只求代码容易理解,性能不是很好:
(ns base) (defn indices-of
[f coll]
(keep-indexed #(if (f %2) %1 nil) coll)) (defn first-index-of
[f coll]
(first (indices-of f coll))) (defn find-thing
"查找`coll`中第一个等于value的索引"
[value coll]
(first-index-of #(= % value) coll)) (def divmod "返回[商 余数]" (juxt quot mod)) ;; 1. 不能利用bit位直接转换的base
;; 基本编码方法就是把输入作为一个数字,不断的对baseN的N进行除法运算,
;; 每次的余数就是base-table的索引,商作为下一次的除数进行计算。直到除尽 (defn base-encode-num
"base编码一个数字`n`
`leading-zeros`为padding个数
`base-table`为编码转换表"
[n leading-zeros base-table]
(let [base-count (count base-table)]
(loop [n n
result ""]
(if (>= n base-count)
(let [[d m] (divmod n base-count)]
(recur d
;; 编码是从后往前进行的,每次除以baseN的N,
;; 以余数作为索引
(-> (nth base-table m)
(str result))))
(->> (if (pos? n)
(-> (nth base-table n)
(str result))
result)
;; 注意,因为是从后往前进行生成的,padding在前面添加
(concat (repeat leading-zeros (first base-table)))
(apply str)))))) (defn base-enc
"编码base10,base36,base58等"
[base-table s]
(let [s-bytes (.getBytes s)
leading-zeros (->> s-bytes
(take-while zero?)
count)]
(-> (java.math.BigInteger. s-bytes)
(base-encode-num leading-zeros base-table)))) (defn invert-table
"生成`table`序列的反向索引表 "
[table]
(into {}
(map #(vector %1 %2)
table
(iterate inc )))) ;; baseN的解码方法就是对输入字符串的每个字符查找索引 * (N ** 输入串反向顺序的索引),
;; 结果累加就是目标数字,再把数字还原为字符串 (defn base-dec
"解码base10,base36,base58等"
[base-table s]
(let [padding (->> s
(take-while #(= % (first base-table)))
(map (constantly (byte ))))
inverted-table (invert-table base-table)
base-count (count base-table)]
(->> (reverse s)
(reduce (fn [[r m] c]
[(+ r (*' m (inverted-table c)))
(*' m base-count)]) [ ])
first
str
java.math.BigInteger.
.toByteArray
(drop-while zero?)
(concat padding)
byte-array
String.))) (def base58-table "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz")
(def enc-base58 (partial base-enc base58-table))
(def dec-base58 (partial base-dec base58-table)) (def base36-table "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")
(def enc-base36 (partial base-enc base36-table))
(def dec-base36 (partial base-dec base36-table)) (def base10-table "0123456789")
(def enc-base10 (partial base-enc base10-table))
(def dec-base10 (partial base-dec base10-table)) (comment
(def target "D9cS9N9iHjMLTdA8YSMRMp") (def r (dec-base58 target)) (= target (enc-base58 r)) (= "64021609848" (enc-base36 "testabc")) (enc-base10 "this is a test")
;; => "2361031878030638688519054699098996" (dec-base10 "2361031878030638688519054699098996")
;; => "this is a test" ) ;;; ============= 以2为基的base算法,base2, base4, base16, base32, base64 ...
;;; 可以利用bit位直接变换的base
;;; 算法把输入的8bit串转换为base(2^n)的nbit串,不需要做除法,
;;; 编码过程 直接把输入字符串转换为bit序列,再根据baseN的位宽(如base64 6位,base32 5位)重新划分,
;;; 查表获取目标串,不需要除法,因此要比上面的base算法速度快
;;; 下面统一称为base2 (defn bits
"转换数字为bit序列,如果不指定`width`,默认为8位"
([n] (bits n ))
([n width]
(reverse (map #(bit-and (bit-shift-right n %) ) (range width))))) (defn numb
"bit序列转换为数字"
[bits]
(BigInteger. (apply str bits) )) (defn string-to-bits
"字符串转换为bit序列"
[msg]
(->> (.getBytes msg)
(map #(bits %))
flatten)) (defn padding-count
"获取字节序列`v`的padding个数"
[v]
(->> (vec v)
rseq
(take-while zero?)
count)) (defn drop-padding
"去掉字节序列`v`的padding"
[v]
(-> (padding-count v)
(drop-last v))) (defn log2
[n]
(/ (Math/log n)
(Math/log ))) (defn get-bit-width
"获取`base-table`的位宽,即用多少字节表示一个字符"
[base-table]
(let [width (-> (count base-table)
log2)]
(when (= width (Math/floor width))
(int width)))) (defn valid-base2-table?
"检测是否为有效的base2表"
[base-table]
(-> (get-bit-width base-table)
nil?
not)) (defn byte-align
"根据位宽bit-width,获取需要对齐的base字节数
比如一个字节是8位,转换base64,每个base64字节为6位
byte-align就是确定最低需要多少个base64的6位字节才能对齐到8位字节"
[bit-width]
(let [max-padding
fn-len-range (fn [width]
(map (partial * width) (range max-padding)))
hex-range (fn-len-range )
bit-range (fn-len-range bit-width)
align-value (->> (clojure.set/intersection (set hex-range) (set bit-range))
(apply min))]
(/ align-value bit-width))) (defn hex-byte-align
"获取最低需要多少个hex字节才能对齐"
[bit-width]
(-> (byte-align bit-width)
(* bit-width)
(/ ))) (comment (byte-align )
;; => 4 (hex-byte-align )
;; => 3
;; base64需要4个6位字节才能对齐到3个8位hex字节,即 4 * 6 = 3 * 8 (byte-align )
;; => 8 (hex-byte-align )
;; => 5
;; base32需要8个5位字节才能对齐到5个8位hex字节, 即 8 * 5 = 5 * 8 (hex-byte-align )
;; => 1
(hex-byte-align )
;; => 1
(hex-byte-align )
;; => 1
;; 可以看到base2, base4,base16 可以对齐到1个字节,不需要padding ) (defn gen-padding
"生成`data`的padding串"
[bit-width data]
(let [len (count data)
align (byte-align bit-width)
aligned-byte (mod len align)]
(if (zero? aligned-byte)
""
(apply str
(-> (- align aligned-byte)
(repeat "=")))))) (defn base2-enc
"编码base2类型的字符串`s`
`base-table`必须是以2为基的base表,base(2^n),例如base16,base32,base64"
[base-table s]
{:pre [(valid-base2-table? base-table)]}
(let [bits (string-to-bits s)
base2-char #(->> (numb %)
(nth base-table))
bit-width (get-bit-width base-table)
data (partition bit-width bit-width (repeat ) bits)]
(str (->> (map base2-char data)
(apply str))
(gen-padding bit-width data)))) (defn base2-dec
"base2解码,`s`为要解码的字符串
`base-table` 解码表"
[base-table s]
{:pre [(valid-base2-table? base-table)]}
(let [bit-width (get-bit-width base-table)
inverted-table (merge (invert-table base-table)
{\= } ;; 添加padding字符
)]
(->> (mapcat #(some-> (inverted-table %1)
(bits bit-width))
s)
;; mapcat自动过滤掉base2-bits的nil值,即忽略s串中不属于base-table的字符
(partition (repeat ))
(map numb)
drop-padding
(map char)
(apply str)))) ;; base64,base32 需要使用base2-enc dec
(def base64-table "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
(def enc-base64 (partial base2-enc base64-table))
(def dec-base64 (partial base2-dec base64-table)) (def base32-table "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567")
(def enc-base32 (partial base2-enc base32-table))
(def dec-base32 (partial base2-dec base32-table)) ;; base16等同于hex转换
(def base16-table "0123456789ABCDEF")
(def enc-base16 (partial base2-enc base16-table))
(def dec-base16 (partial base2-dec base16-table)) ;; base2是2进制表示
(def base2-table "01")
(def enc-base2 (partial base2-enc base2-table))
(def dec-base2 (partial base2-dec base2-table)) ;; 自动定义base的宏
(defmacro defbase
"根据base-table生成base编码和解码函数的定义"
[base-name base-table]
(let [table-name (symbol (str base-name "-table"))
enc-fname (symbol (str "enc-" base-name))
dec-fname (symbol (str "dec-" base-name))
def-table `(def ~table-name ~base-table)]
(if-let [bit-width (get-bit-width base-table)]
`(do ~def-table
(def ~enc-fname (partial base2-enc ~base-table))
(def ~dec-fname (partial base2-dec ~base-table)))
`(do ~def-table
(def ~enc-fname (partial base-enc ~base-table))
(def ~dec-fname (partial base-dec ~base-table)))))) ;; (defbase base64 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
;; (defbase base32 "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567")
;; (defbase base16 "0123456789ABCDEF")
;; (defbase base2 "01")
;; (defbase base58 "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz")
;; (defbase base36 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")
;; (defbase base10 "0123456789") (defbase base8 "01234567") (comment base8-table
;; => "01234567" (enc-base8 "test 1")
;; => "3506256335020061" (dec-base8 *1)
;; => "test 1" (= (padding-count [ ])) (= (padding-count [ ])) (= (padding-count [])) (= [ ] (drop-padding [ ])) (= "I" (base2-dec base64-table "SQ==")) (= "AM" (base2-dec base64-table "QU0=")) (base2-enc base64-table "I")
;; => "SQ==" (base2-enc base64-table "AM")
;; => "QU0=" (String. (base64/encode "I"))
;; => "SQ==" (String. (base64/encode "AM"))
;; => "QU0=" (= "this is a test" (base2-dec base32-table "ORUGS4ZANFZSAYJAORSXG5A=")) (base32/encode "this is a test")
;; => "ORUGS4ZANFZSAYJAORSXG5A=" (base2-enc base32-table "this is a test")
;; => "ORUGS4ZANFZSAYJAORSXG5A=" ;; base-table不符,产生异常
(= "base58_is_boring" (base2-dec base58-table "D9cS9N9iHjMLTdA8YSMRMp")) (= (get-bit-width base64-table)) (= (get-bit-width base32-table)) (= nil (get-bit-width base58-table)) (enc-base32 "I") ;; base32的padding比较多
;; => "JE======" (dec-base32 *1)
;; => "I" ;; base16相当于内存hex表示
(enc-base16 "this is a test")
;; => "7468697320697320612074657374" (dec-base16 "7468697320697320612074657374")
;; => "this is a test" (dec-base16 "7468IIOO697320697320612074657374") ;; 不合法字符直接忽略
;; => "this is a test" (codecs/bytes->hex (codecs/str->bytes "this is a test"))
;; => "7468697320697320612074657374" ;; base2相当于内存的二进制表示
(enc-base2 "t")
;; => "01110100" ;; \t的二进制
(Integer/toString (int \t) )
;; => "1110100" (dec-base2 "01110100")
;; => "t"
;; 注意解码时前面的0是必要的 (def s1 "this is a test")
(= (enc-base64 s1) (String. (base64/encode s1))) (= (enc-base32 s1) (base32/encode s1)) ) ;;; ============== 自动base解码 (defn valid-base-str?
"检查字符串s是否符合base-table"
[base-table s]
(clojure.set/subset? (set s)
(-> (set base-table)
(conj \=)))) (defn guess-base
[text]
(cond
;; 这里必须按从小到大的顺序测试
(valid-base-str? base16-table text) :base16
(valid-base-str? base32-table text) :base32
(valid-base-str? base64-table text) :base64
:else :unknown)) (defn decode
"自动base解码"
[text & {:keys [step debug]
:or {step
debug false}}]
(loop [text text
step step]
(when debug
(println "step" step "decode:" text))
(case (guess-base text)
:base16 (do (println "step" step "--> base16")
(-> (dec-base16 text)
(recur (inc step))))
:base32 (do (println "step" step "--> base32")
(-> (dec-base32 text)
(recur (inc step))))
:base64 (do (println "step" step "--> base64")
(-> (dec-base64 text)
(recur (inc step))))
:unknown (do (println "result:" text)
text)))) (comment
(def text "3441353234343435353735323442353634423539354134353336353333323536343735323444343535333537344234433441343634353535353535323533343734413335344134363439353534423530344135413437353634463444353334463441344534443435333235363533353534423531354134363431353334413335") ;; 经测试,直接转换bits对于长字符串速度比较慢
(decode text)
;; step 1 --> base16
;; step 2 --> base16
;; step 3 --> base32
;; step 4 --> base32
;; step 5 --> base64
;; step 6 --> base64
;; result: key:hi!venus
;; => "key:hi!venus" )
3 结论
base编码基本是码表转换,产生人可读的字符表示,不适合加密。
作者: ntestoc
Created: 2019-04-11 四 10:28