trival

avoid allocating large buf on stack
Merge branch 'master' of https://github.com/wangyu-/udp2raw-tunnel
2025-09-15 11:44:27 +08:00 · 2017-08-26 06:56:30 -05:00 · 2017-08-26 06:35:47 -05:00 · 2017-08-26 05:38:40 -05:00 · 2017-08-26 05:38:33 -05:00 · 2017-08-25 19:14:17 -07:00
42 changed files with 13712 additions and 588 deletions
--- a/.cproject
+++ b/.cproject
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+	<storageModule moduleId="org.eclipse.cdt.core.settings">
+		<cconfiguration id="cdt.managedbuild.toolchain.gnu.base.1051378038">
+			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.toolchain.gnu.base.1051378038" moduleId="org.eclipse.cdt.core.settings" name="Default">
+				<externalSettings/>
+				<extensions>
+					<extension id="org.eclipse.cdt.core.GNU_ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+				</extensions>
+			</storageModule>
+			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+				<configuration buildProperties="" id="cdt.managedbuild.toolchain.gnu.base.1051378038" name="Default" parent="org.eclipse.cdt.build.core.emptycfg">
+					<folderInfo id="cdt.managedbuild.toolchain.gnu.base.1051378038.1421447843" name="/" resourcePath="">
+						<toolChain id="cdt.managedbuild.toolchain.gnu.base.1854135910" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.base">
+							<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.GNU_ELF" id="cdt.managedbuild.target.gnu.platform.base.708367396" name="Debug Platform" osList="linux,hpux,aix,qnx" superClass="cdt.managedbuild.target.gnu.platform.base"/>
+							<builder id="cdt.managedbuild.target.gnu.builder.base.1743684210" managedBuildOn="false" name="Gnu Make Builder.Default" superClass="cdt.managedbuild.target.gnu.builder.base"/>
+							<tool id="cdt.managedbuild.tool.gnu.archiver.base.1848194835" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
+							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.base.1873425854" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.base"/>
+							<tool id="cdt.managedbuild.tool.gnu.c.compiler.base.1356109619" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.base"/>
+							<tool id="cdt.managedbuild.tool.gnu.c.linker.base.1018655568" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.base"/>
+							<tool id="cdt.managedbuild.tool.gnu.cpp.linker.base.180014749" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.base"/>
+							<tool id="cdt.managedbuild.tool.gnu.assembler.base.2017907772" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.base"/>
+						</toolChain>
+					</folderInfo>
+				</configuration>
+			</storageModule>
+			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+		</cconfiguration>
+	</storageModule>
+	<storageModule moduleId="scannerConfiguration">
+		<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.base.436825263;cdt.managedbuild.toolchain.gnu.base.436825263.480908490;cdt.managedbuild.tool.gnu.cpp.compiler.base.1940762076;cdt.managedbuild.tool.gnu.cpp.compiler.input.997669137">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.base.436825263;cdt.managedbuild.toolchain.gnu.base.436825263.480908490;cdt.managedbuild.tool.gnu.c.compiler.base.233419498;cdt.managedbuild.tool.gnu.c.compiler.input.460189617">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+	</storageModule>
+	<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+		<project id="udp2raw.null.1592488805" name="udp2raw"/>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
+</cproject>
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+lib/aes_acc/asm/* linguist-vendored
--- a/.project
+++ b/.project
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>udp2raw-tunnel-desktop</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+			<triggers>clean,full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+			<triggers>full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.cdt.core.cnature</nature>
+		<nature>org.eclipse.cdt.core.ccnature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+	</natures>
+</projectDescription>
--- a/README.md
+++ b/README.md
@@ -1,26 +1,33 @@
 # Udp2raw-tunnel
 ![image0](images/image0.PNG)

-A UDP Tunnel which tunnels UDP via FakeTCP/UDP/ICMP Traffic by using Raw Socket,helps you Bypass UDP FireWalls(or Unstable UDP Environment).Its Encrpyted,Anti-Replay and Multiplexed.It aslo acts as a Connection Stablizer.
+A Tunnel which tunnels UDP via FakeTCP/UDP/ICMP Traffic by using Raw Socket, helps you Bypass UDP FireWalls(or Unstable UDP Environment). Its Encrypted, Anti-Replay and Multiplexed. It also acts as a Connection Stabilizer.

 [简体中文](/doc/README.zh-cn.md)
+# Support Platforms
+Linux host (including desktop Linux,Android phone/tablet,OpenWRT router,or Raspberry PI) with root access.
+
+For Winodws/MacOS,the 4.4mb virtual image with udp2raw pre-installed has been released,you can load it with Vmware/VirtualBox.The virtual image has been set to auto obtain ip,udp2raw can be run imidiately after boot finished(make sure network mode of virtual machine has been set to bridged)(only udp2raw has to be run under virtual machine,all other programs runs under Windows/MacOS as usual).
+
+
 # Features 
-### Send / Receive UDP Packet with fake-tcp/icmp headers
-Fake-tcp/icmp headers help you bypass UDP blocking, UDP QOS or improper UDP NAT behavior on some ISPs. Raw packets with UDP headers are also supported.In UDP header mode,it behaves just like a normal UDP tunnel,and you can just make use of the other features.
+### Send/Receive UDP Packets with ICMP/FakeTCP/UDP headers
+ICMP/FakeTCP headers help you bypass UDP blocking, UDP QOS or improper UDP NAT behavior on some ISPs. In ICMP header mode,udp2raw works like an ICMP tunnel. 

-### Simulate TCP Handshake
-Simulates the 3-way handshake, along with seq and ack_seq. TCP options MSS, sackOk, TS, TS_ack, wscale are also simulated. Real-time delivery guaranteed, no TCP over TCP problem when using OpenVPN.
+UDP headers are also supported. In UDP header mode, it behaves just like a normal UDP tunnel, and you can just make use of the other features (such as encrytion, anti-replay, or connection stalization).

-### Encrpytion, Anti-Replay, No MITM
+### Simulated TCP with Real-time/Out-of-Order Delivery
+In FakeTCP header mode,udp2raw simulates 3-way handshake while establishing a connection,simulates seq and ack_seq while data transferring. It also simulates following TCP options: `MSS`, `sackOk`, `TS`, `TS_ack`, `wscale`.Firewalls will regard FakeTCP as a TCP connection, but its essentially UDP: it supports real-time/out-of-order delivery(just as normal UDP does), no congrestion control or re-transmission. So there wont be any TCP over TCP problem when using OpenVPN.
+
+### Encrpytion, Anti-Replay
 * Encrypt your traffic with AES-128-CBC.
 * Protect data integrity by MD5 or CRC32.
 * Defense replay attack with an anti-replay window, smiliar to IPSec and OpenVPN. 
-* Authenticate mutually, no MITM attacks.

 ### Failure Dectection & Stablization (Connection Recovery)
-Conection failures are detected by heartbeats. If timed-out,client will automatically change port number and reconnect. If reconnection is successful, the previous connection will be recovered, and all existing UDP conversations will stay vaild. 
+Conection failures are detected by heartbeats. If timed-out, client will automatically change port number and reconnect. If reconnection is successful, the previous connection will be recovered, and all existing UDP conversations will stay vaild. 

-For example, if you use UDP2RAW + OpenVPN, OpenVPN won't lose connection after any reconnect, **even if the network cable is re-plugged or the WiFi access point is changed**.
+For example, if you use udp2raw + OpenVPN, OpenVPN won't lose connection after any reconnect, **even if network cable is re-plugged or WiFi access point is changed**.

 ### Other Features
 * **Multiplexing** One client can handle multiple UDP connections, all of which share the same raw connection.
@@ -29,26 +36,14 @@ For example, if you use UDP2RAW + OpenVPN, OpenVPN won't lose connection after a

 * **NAT Support** All of the 3 modes work in NAT environments.

-* **OpenVZ Support** Tested on BandwagonHost.
+* **OpenVZ Support** Tested on BandwagonHost VPS.

-* **OpenWRT Support** No dependencies, easy to build. Binary for ar71xx are included in release.
+* **Easy to Build** No dependencies.To cross-compile udp2raw,all you need to do is just to download a toolchain,modify makefile to point at the toolchain,run `make cross` then everything is done.(Note:Pre-compiled binaries for Desktop,RaspberryPi,Android,some Openwrt Routers are already included in [Releases](https://github.com/wangyu-/udp2raw-tunnel/releases))

 ### Keywords
-* UDP QoS Bypass
-* UDP Blocking Bypass
-* OpenVPN TCP over TCP problem
-* OpenVPN over ICMP
-* UDP to ICMP tunnel
-* UDP to TCP tunnel
-* UDP over ICMP
-* UDP over TCP
+`Bypass UDP QoS` `Bypass UDP Blocking` `Bypass OpenVPN TCP over TCP problem` `OpenVPN over ICMP` `UDP to ICMP tunnel` `UDP to TCP tunnel` `UDP over ICMP` `UDP over TCP`

 # Getting Started
-### Prerequisites
-A Linux host (including desktop Linux, OpenWRT router, or Raspberry PI) with root access.
-
-If you want to use it on MICRO$OFT Windows, you can use VMware or Hyper-V (both bridged mode and <del>NAT mode</del> are supported).
-
 ### Installing
 Download binary release from https://github.com/wangyu-/udp2raw-tunnel/releases

@@ -56,20 +51,27 @@ Download binary release from https://github.com/wangyu-/udp2raw-tunnel/releases
 Assume your UDP is blocked or being QOS-ed or just poorly supported. Assume your server ip is 44.55.66.77, you have a service listening on udp port 7777.

 ```bash
-# Run at client side
-./udp2raw_amd64 -c -l0.0.0.0:3333  -r44.55.66.77:4096 -a -k "passwd" --raw-mode faketcp
-
 # Run at server side:
 ./udp2raw_amd64 -s -l0.0.0.0:4096 -r 127.0.0.1:7777  -a -k "passwd" --raw-mode faketcp
+
+# Run at client side
+./udp2raw_amd64 -c -l0.0.0.0:3333  -r44.55.66.77:4096 -a -k "passwd" --raw-mode faketcp
 ```
+###### Server Output:
+![](images/output_server.PNG)
+###### Client Output:
+![](images/output_client.PNG)

 Now,an encrypted raw tunnel has been established between client and server through TCP port 4096. Connecting to UDP port 3333 at the client side is equivalent to connecting to port 7777 at the server side. No UDP traffic will be exposed.

+### Note
+To run on Android, check [Android_Guide](/doc/android_guide.md)
+
 # Advanced Topic
 ### Usage
 ```
 udp2raw-tunnel
-version: Aug  5 2017 21:03:54
+version: Aug 18 2017 00:29:11
 repository: https://github.com/wangyu-/udp2raw-tunnel

 usage:
@@ -79,8 +81,8 @@ usage:
 common options,these options must be same on both side:
    --raw-mode            <string>        avaliable values:faketcp(default),udp,icmp
    -k,--key              <string>        password to gen symetric key,default:"secret key"
-    --auth-mode           <string>        avaliable values:aes128cbc(default),xor,none
-    --cipher-mode         <string>        avaliable values:md5(default),crc32,simple,none
+    --cipher-mode         <string>        avaliable values:aes128cbc(default),xor,none
+    --auth-mode           <string>        avaliable values:md5(default),crc32,simple,none
    -a,--auto-rule                        auto add (and delete) iptables rule
    -g,--gen-rule                         generate iptables rule then exit
    --disable-anti-replay                 disable anti-replay,not suggested
@@ -89,6 +91,7 @@ client options:
    --source-port         <port>          force source-port for raw socket,tcp/udp only
                                          this option disables port changing while re-connecting
 other options:
+    --conf-file         <string>        read options from a configuration file instead of command line
    --log-level           <number>        0:never    1:fatal   2:error   3:warn 
                                          4:info (default)     5:debug   6:trace
    --log-position                        enable file name,function name,line number in log
@@ -100,18 +103,51 @@ other options:
                                          0:dont increase
                                          1:increase every packet
                                          2:increase randomly, about every 3 packets (default)
+    --lower-level         <string>        send packet at OSI level 2, format:'if_name#dest_mac_adress'
+                                          ie:'eth0#00:23:45:67:89:b9'.Beta.
    -h,--help                             print this help message
 ```

-### IPTABLES rule
+### Iptables rules,`-a` and `-g`
 This program sends packets via raw socket. In FakeTCP mode, Linux kernel TCP packet processing has to be blocked by a iptables rule on both sides, otherwise the kernel will automatically send RST for an unrecongized TCP packet and you will sustain from stability / peformance problems. You can use `-a` option to let the program automatically add / delete iptables rule on start / exit. You can also use the `-g` option to generate iptables rule and add it manually.

-### `cipher-mode` and `auth-mode` 
+### `--cipher-mode` and `--auth-mode` 
 It is suggested to use `aes128cbc` + `md5` to obtain maximum security. If you want to run the program on a router, you can try `xor` + `simple`, which can fool packet inspection by firewalls the most of time, but it cannot protect you from serious attacks. Mode none is only for debugging purpose. It is not recommended to set the cipher-mode or auth-mode to none.

-### seq-mode
+### `--seq-mode`
 The FakeTCP mode does not behave 100% like a real tcp connection. ISPs may be able to distinguish the simulated tcp traffic from the real TCP traffic (though it's costly). seq-mode can help you change the seq increase behavior slightly. If you experience connection problems, try to change the value. 

+### `--conf-file`
+
+You can also load options from a configuration file in order to keep secrets away from ps command.
+
+For example, rewrite the options for the above `server` example (in Getting Started section) into configuration file:
+
+`server.conf`
+
+```
+-s
+# You can add comments like this
+# Comments MUST occupy an entire line
+# Or they will not work as expected
+# Listen address
+-l 0.0.0.0:4096
+# Remote address
+-r 127.0.0.1:7777
+-a
+-k passwd
+--raw-mode faketcp
+```
+
+Pay attention to the `-k` parameter: In command line mode the quotes around the password will be removed by shell. In configuration files we do not remove quotes.
+
+Then start the server with
+
+```bash
+./udp2raw_amd64 --conf-file server.conf
+```
+
+
 # Peformance Test
 #### Test method:
 iperf3 TCP via OpenVPN + udp2raw 
@@ -141,20 +177,22 @@ raw_mode: faketcp  cipher_mode: aes128cbc  auth_mode: md5
 (reverse speed was simliar and not uploaded)

 # Application
-### tunneling any traffic via raw traffic by using udp2raw +openvpn
+## Tunneling any traffic via raw traffic by using udp2raw +openvpn
 ![image_vpn](images/openvpn.PNG)
-1. bypasses UDP block/UDP QOS
+1. Bypasses UDP block/UDP QOS

-2. no TCP ovr tcp problem (tcp over tcp problem http://sites.inka.de/bigred/devel/tcp-tcp.html ,https://community.openvpn.net/openvpn/ticket/2 )
+2. No TCP over TCP problem (TCP over TCP problem http://sites.inka.de/bigred/devel/tcp-tcp.html ,https://community.openvpn.net/openvpn/ticket/2 )

-3. openvpn over icmp also becomes a choice
+3. OpenVpn over ICMP also becomes a choice

-more details at [openvpn+udp2raw_guide](/doc/openvpn_guide.md)
-### speed-up tcp connection via raw traffic by using udp2raw+kcptun
+4. Supports almost any UDP-based VPN
+
+More details at [openvpn+udp2raw_guide](/doc/openvpn_guide.md)
+## Speed-up tcp connection via raw traffic by using udp2raw+kcptun
 kcptun is a tcp connection speed-up program,it speeds-up tcp connection by using kcp protocol on-top of udp.by using udp2raw,you can use kcptun while udp is QoSed or blocked.
 (kcptun, https://github.com/xtaci/kcptun)

-### speed-up tcp connection via raw traffic by using udp2raw+finalspeed
+## Speed-up tcp connection via raw traffic by using udp2raw+finalspeed
 finalspeed is a tcp connection speed-up program similiar to kcptun,it speeds-up tcp connection by using kcp protocol on-top of udp or tcp.but its tcp mode doesnt support openvz,you can bypass this problem if you use udp2raw+finalspeed together,and icmp mode also becomes avaliable.

 # How to build
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -13,7 +13,7 @@ set(SOURCE_FILES
        main.cpp
        network.cpp
        )
-set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-parameter -static")
+set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-parameter -Wno-missing-field-initializers  -static")
 #set(CMAKE_LINK_LIBRARY_FLAG "-lrt")
-add_executable(udp2raw_tunnel ${SOURCE_FILES})
-target_link_libraries(udp2raw_tunnel rt)
+add_executable(udp2raw_cmake ${SOURCE_FILES})
+target_link_libraries(udp2raw_cmake rt)
--- a/common.cpp
+++ b/common.cpp
@@ -15,8 +15,16 @@ int about_to_exit=0;
 raw_mode_t raw_mode=mode_faketcp;
 unordered_map<int, const char*> raw_mode_tostring = {{mode_faketcp, "faketcp"}, {mode_udp, "udp"}, {mode_icmp, "icmp"}};
 int socket_buf_size=1024*1024;
+
 static int random_number_fd=-1;
-char iptables_rule[200]="";
+string iptables_pattern="";
+int iptables_rule_added=0;
+int iptables_rule_keeped=0;
+int iptables_rule_keep_index=0;
+//int iptables_rule_no_clear=0;
+
+
+
 program_mode_t program_mode=unset_mode;//0 unset; 1client 2server

 u64_t get_current_time()
@@ -50,38 +58,161 @@ char * my_ntoa(u32_t ip)
 }


-int add_iptables_rule(char * s)
+/*
+int add_iptables_rule(const char * s)
 {
-	strcpy(iptables_rule,s);
-	char buf[300]="iptables -I ";
-	strcat(buf,s);
-	if(system(buf)==0)
+
+	iptables_pattern=s;
+
+	string rule="iptables -I INPUT ";
+	rule+=iptables_pattern;
+	rule+=" -j DROP";
+
+	char *output;
+	if(run_command(rule.c_str(),output)==0)
 	{
-		mylog(log_warn,"auto added iptables rule by:  %s\n",buf);
+		mylog(log_warn,"auto added iptables rule by:  %s\n",rule.c_str());
 	}
 	else
 	{
-		mylog(log_fatal,"auto added iptables failed by: %s\n",buf);
+		mylog(log_fatal,"auto added iptables failed by: %s\n",rule.c_str());
+		//mylog(log_fatal,"reason : %s\n",strerror(errno));
 		myexit(-1);
 	}
+	iptables_rule_added=1;
+	return 0;
+}*/
+string chain[2];
+string rule_keep[2];
+string rule_keep_add[2];
+string rule_keep_del[2];
+u64_t keep_rule_last_time=0;
+
+pthread_t keep_thread;
+int keep_thread_running=0;
+int iptables_gen_add(const char * s,u32_t const_id)
+{
+	string dummy="";
+	iptables_pattern=s;
+	chain[0] =dummy+ "udp2rawDwrW_C";
+	rule_keep[0]=dummy+ iptables_pattern+" -j " +chain[0];
+	rule_keep_add[0]=dummy+"iptables -I INPUT "+rule_keep[0];
+
+	char *output;
+	run_command(dummy+"iptables -N "+chain[0],output,show_none);
+	run_command(dummy+"iptables -F "+chain[0],output);
+	run_command(dummy+"iptables -I "+chain[0] + " -j DROP",output);
+
+	rule_keep_del[0]=dummy+"iptables -D INPUT "+rule_keep[0];
+
+	run_command(rule_keep_del[0],output,show_none);
+	run_command(rule_keep_del[0],output,show_none);
+
+	if(run_command(rule_keep_add[0],output)!=0)
+	{
+		mylog(log_fatal,"auto added iptables failed by: %s\n",rule_keep_add[0].c_str());
+		myexit(-1);
+	}
+	return 0;
+}
+int iptables_rule_init(const char * s,u32_t const_id,int keep)
+{
+	iptables_pattern=s;
+	iptables_rule_added=1;
+	iptables_rule_keeped=keep;
+
+	string dummy="";
+	char const_id_str[100];
+	sprintf(const_id_str, "%x", const_id);
+
+	chain[0] =dummy+ "udp2rawDwrW_"+const_id_str+"_C0";
+	chain[1] =dummy+ "udp2rawDwrW_"+const_id_str+"_C1";
+
+	rule_keep[0]=dummy+ iptables_pattern+" -j " +chain[0];
+	rule_keep[1]=dummy+ iptables_pattern+" -j " +chain[1];
+
+	rule_keep_add[0]=dummy+"iptables -I INPUT "+rule_keep[0];
+	rule_keep_add[1]=dummy+"iptables -I INPUT "+rule_keep[1];
+
+	rule_keep_del[0]=dummy+"iptables -D INPUT "+rule_keep[0];
+	rule_keep_del[1]=dummy+"iptables -D INPUT "+rule_keep[1];
+
+	keep_rule_last_time=get_current_time();
+
+	char *output;
+
+	for(int i=0;i<=iptables_rule_keeped;i++)
+	{
+		run_command(dummy+"iptables -N "+chain[i],output);
+		run_command(dummy+"iptables -F "+chain[i],output);
+		run_command(dummy+"iptables -I "+chain[i] + " -j DROP",output);
+
+		if(run_command(rule_keep_add[i],output)!=0)
+		{
+			mylog(log_fatal,"auto added iptables failed by: %s\n",rule_keep_add[i].c_str());
+			myexit(-1);
+		}
+	}
+	mylog(log_warn,"auto added iptables rules\n");
+	return 0;
+}
+
+int keep_iptables_rule()  //magic to work on a machine without grep/iptables --check/-m commment
+{
+	/*
+	if(iptables_rule_keeped==0) return  0;
+
+
+	uint64_t tmp_current_time=get_current_time();
+	if(tmp_current_time-keep_rule_last_time<=iptables_rule_keep_interval)
+	{
+		return 0;
+	}
+	else
+	{
+		keep_rule_last_time=tmp_current_time;
+	}*/
+
+	mylog(log_debug,"keep_iptables_rule begin %llu\n",get_current_time());
+	iptables_rule_keep_index+=1;
+	iptables_rule_keep_index%=2;
+
+	string dummy="";
+	char *output;
+
+	int i=iptables_rule_keep_index;
+
+	run_command(dummy + "iptables -N " + chain[i], output,show_none);
+
+	if (run_command(dummy + "iptables -F " + chain[i], output,show_none) != 0)
+		mylog(log_warn, "iptables -F failed %d\n",i);
+
+	if (run_command(dummy + "iptables -I " + chain[i] + " -j DROP",output,show_none) != 0)
+		mylog(log_warn, "iptables -I failed %d\n",i);
+
+	if (run_command(rule_keep_del[i], output,show_none) != 0)
+		mylog(log_warn, "rule_keep_del failed %d\n",i);
+
+	run_command(rule_keep_del[i], output,show_none); //do it twice,incase it fails for unknown random reason
+
+	if(run_command(rule_keep_add[i], output,show_log)!=0)
+		mylog(log_warn, "rule_keep_del failed %d\n",i);
+
+	mylog(log_debug,"keep_iptables_rule end %llu\n",get_current_time());
 	return 0;
 }

 int clear_iptables_rule()
 {
-	if(iptables_rule[0]!=0)
-	{
-		char buf[300]="iptables -D ";
-		strcat(buf,iptables_rule);
-		if(system(buf)==0)
-		{
-			mylog(log_warn,"iptables rule cleared by: %s \n",buf);
-		}
-		else
-		{
-			mylog(log_error,"clear iptables failed by: %s\n",buf);
-		}
+	char *output;
+	string dummy="";
+	if(!iptables_rule_added) return 0;

+	for(int i=0;i<=iptables_rule_keeped;i++ )
+	{
+		run_command(rule_keep_del[i],output);
+		run_command(dummy+"iptables -F "+chain[i],output);
+		run_command(dummy+"iptables -X "+chain[i],output);
 	}
 	return 0;
 }
@@ -135,7 +266,7 @@ u64_t ntoh64(u64_t a)
 {
 	if(__BYTE_ORDER == __LITTLE_ENDIAN)
 	{
-		return __bswap_64( a);
+		return bswap_64( a);
 	}
 	else return a;

@@ -144,7 +275,7 @@ u64_t hton64(u64_t a)
 {
 	if(__BYTE_ORDER == __LITTLE_ENDIAN)
 	{
-		return __bswap_64( a);
+		return bswap_64( a);
 	}
 	else return a;

@@ -212,8 +343,19 @@ int set_buf_size(int fd)
 void myexit(int a)
 {
    if(enable_log_color)
-   	 printf("%s\n",RESET);
-    clear_iptables_rule();
+   	printf("%s\n",RESET);
+    if(keep_thread_running)
+    {
+		if(pthread_cancel(keep_thread))
+		{
+			mylog(log_warn,"pthread_cancel failed\n");
+		}
+		else
+		{
+			mylog(log_info,"pthread_cancel success\n");
+		}
+    }
+	clear_iptables_rule();
 	exit(a);
 }
 void  signal_handler(int sig)
@@ -248,7 +390,31 @@ int char_to_numbers(const char * data,int len,id_t &id1,id_t &id2,id_t &id3)
 	id3=ntohl(  *((id_t*)(data+sizeof(id_t)*2)) );
 	return 0;
 }
-
+int hex_to_u32(const string & a,u32_t &output)
+{
+	//string b="0x";
+	//b+=a;
+	if(sscanf(a.c_str(),"%x",&output)==1)
+	{
+		//printf("%s %x\n",a.c_str(),output);
+		return 0;
+	}
+	mylog(log_error,"<%s> doesnt contain a hex\n",a.c_str());
+	return -1;
+}
+int hex_to_u32_with_endian(const string & a,u32_t &output)
+{
+	//string b="0x";
+	//b+=a;
+	if(sscanf(a.c_str(),"%x",&output)==1)
+	{
+		output=htonl(output);
+		//printf("%s %x\n",a.c_str(),output);
+		return 0;
+	}
+	mylog(log_error,"<%s> doesnt contain a hex\n",a.c_str());
+	return -1;
+}
 bool larger_than_u32(u32_t a,u32_t b)
 {

@@ -310,3 +476,254 @@ bool larger_than_u16(uint16_t a,uint16_t b)
 		}
 	}
 }
+vector<string> string_to_vec(const char * s,const char * sp) {
+	  vector<string> res;
+	  string str=s;
+	  char *p = strtok ((char *)str.c_str(),sp);
+	  while (p != NULL)
+	  {
+		 res.push_back(p);
+	    //printf ("%s\n",p);
+	    p = strtok(NULL, sp);
+	  }
+
+	 /* for(int i=0;i<(int)res.size();i++)
+	  {
+		  printf("<<%s>>\n",res[i].c_str());
+	  }*/
+	  return res;
+}
+
+vector< vector <string> > string_to_vec2(const char * s)
+{
+	vector< vector <string> > res;
+	vector<string> lines=string_to_vec(s,"\n");
+	for(int i=0;i<int(lines.size());i++)
+	{
+		vector<string> tmp;
+		tmp=string_to_vec(lines[i].c_str(),"\t ");
+		res.push_back(tmp);
+	}
+	return res;
+}
+int read_file(const char * file,string &output)
+{
+	const int max_len=3*1024*1024;
+   // static char buf[max_len+100];
+	string buf0;
+	buf0.reserve(max_len+200);
+	char * buf=(char *)buf0.c_str();
+	buf[max_len]=0;
+    //buf[sizeof(buf)-1]=0;
+	int fd=open(file,O_RDONLY);
+	if(fd==-1)
+	{
+		 mylog(log_error,"read_file %s fail\n",file);
+		 return -1;
+	}
+	int len=read(fd,buf,max_len);
+	if(len==max_len)
+	{
+		buf[0]=0;
+        mylog(log_error,"%s too long,buf not large enough\n",file);
+        return -2;
+	}
+	else if(len<0)
+	{
+		buf[0]=0;
+        mylog(log_error,"%s read fail %d\n",file,len);
+        return -3;
+	}
+	else
+	{
+		buf[len]=0;
+		output=buf;
+	}
+	return 0;
+}
+int run_command(string command0,char * &output,int flag) {
+    FILE *in;
+
+
+    if((flag&show_log)==0) command0+=" 2>&1 ";
+
+    const char * command=command0.c_str();
+
+    int level= (flag&show_log)?log_warn:log_debug;
+
+    if(flag&show_command)
+    {
+    	mylog(log_info,"run_command %s\n",command);
+    }
+    else
+    {
+    	mylog(log_debug,"run_command %s\n",command);
+    }
+    static __thread char buf[1024*1024+100];
+    buf[sizeof(buf)-1]=0;
+    if(!(in = popen(command, "r"))){
+        mylog(level,"command %s popen failed,errno %s\n",command,strerror(errno));
+        return -1;
+    }
+
+    int len =fread(buf, 1024*1024, 1, in);
+    if(len==1024*1024)
+    {
+    	buf[0]=0;
+        mylog(level,"too long,buf not larger enough\n");
+        return -2;
+    }
+    else
+    {
+       	buf[len]=0;
+    }
+    int ret;
+    if(( ret=ferror(in) ))
+    {
+        mylog(level,"command %s fread failed,ferror return value %d \n",command,ret);
+        return -3;
+    }
+    //if(output!=0)
+    output=buf;
+    ret= pclose(in);
+
+    int ret2=WEXITSTATUS(ret);
+
+    if(ret!=0||ret2!=0)
+    {
+    	mylog(level,"commnad %s ,pclose returned %d ,WEXITSTATUS %d,errnor :%s \n",command,ret,ret2,strerror(errno));
+    	return -4;
+    }
+
+    return 0;
+
+}
+/*
+int run_command_no_log(string command0,char * &output) {
+    FILE *in;
+    command0+=" 2>&1 ";
+    const char * command=command0.c_str();
+    mylog(log_debug,"run_command_no_log %s\n",command);
+    static char buf[1024*1024+100];
+    buf[sizeof(buf)-1]=0;
+    if(!(in = popen(command, "r"))){
+        mylog(log_debug,"command %s popen failed,errno %s\n",command,strerror(errno));
+        return -1;
+    }
+
+    int len =fread(buf, 1024*1024, 1, in);
+    if(len==1024*1024)
+    {
+    	buf[0]=0;
+        mylog(log_debug,"too long,buf not larger enough\n");
+        return -2;
+    }
+    else
+    {
+       	buf[len]=0;
+    }
+    int ret;
+    if(( ret=ferror(in) ))
+    {
+        mylog(log_debug,"command %s fread failed,ferror return value %d \n",command,ret);
+        return -3;
+    }
+    //if(output!=0)
+    output=buf;
+    ret= pclose(in);
+
+    int ret2=WEXITSTATUS(ret);
+
+    if(ret!=0||ret2!=0)
+    {
+    	mylog(log_debug,"commnad %s ,pclose returned %d ,WEXITSTATUS %d,errnor :%s \n",command,ret,ret2,strerror(errno));
+    	return -4;
+    }
+
+    return 0;
+
+}*/
+
+// Remove preceding and trailing characters
+string trim(const string& str, char c) {
+	size_t first = str.find_first_not_of(c);
+	if(string::npos==first)
+	{
+		return "";
+	}
+	size_t last = str.find_last_not_of(c);
+	return str.substr(first,(last-first+1));
+}
+
+vector<string> parse_conf_line(const string& s0)
+{
+	string s=s0;
+	s.reserve(s.length()+200);
+	char *buf=(char *)s.c_str();
+	//char buf[s.length()+200];
+	char *p=buf;
+	int i=int(s.length())-1;
+	int j;
+	vector<string>res;
+	strcpy(buf,(char *)s.c_str());
+	while(i>=0)
+	{
+		if(buf[i]==' ' || buf[i]== '\t')
+			buf[i]=0;
+		else break;
+		i--;
+	}
+	while(*p!=0)
+	{
+		if(*p==' ' || *p== '\t')
+		{
+			p++;
+		}
+		else break;
+	}
+	int new_len=strlen(p);
+	if(new_len==0)return res;
+	if(p[0]=='#') return res;
+	if(p[0]!='-')
+	{
+		mylog(log_fatal,"line :<%s> not begin with '-' ",s.c_str());
+		myexit(-1);
+	}
+
+	for(i=0;i<new_len;i++)
+	{
+		if(p[i]==' '||p[i]=='\t')
+		{
+			break;
+		}
+	}
+	if(i==new_len)
+	{
+		res.push_back(p);
+		return res;
+	}
+
+	j=i;
+	while(p[j]==' '||p[j]=='\t')
+		j++;
+	p[i]=0;
+	res.push_back(p);
+	res.push_back(p+j);
+	return res;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/common.h
+++ b/common.h
@@ -43,11 +43,12 @@
 #include <stdarg.h>
 #include <assert.h>
 #include <linux/if_packet.h>
-
-
-
+#include <byteswap.h>
+#include <pthread.h>

 #include<unordered_map>
+#include<vector>
+#include<string>
 using  namespace std;


@@ -94,7 +95,11 @@ const u32_t client_conn_uplink_timeout=client_conn_timeout+2000;
 //const uint32_t server_conn_timeout=conv_timeout+60000;//this should be 60s+ longer than conv_timeout,so that conv_manager can destruct convs gradually,to avoid latency glicth
 const u32_t server_conn_timeout=conv_timeout+10000;//for test

+//const u32_t iptables_rule_keep_interval=4000;
+
 extern int about_to_exit;
+extern pthread_t keep_thread;
+extern int keep_thread_running;

 enum raw_mode_t{mode_faketcp=0,mode_udp,mode_icmp,mode_end};
 extern raw_mode_t raw_mode;
@@ -140,8 +145,33 @@ int char_to_numbers(const char * data,int len,id_t &id1,id_t &id2,id_t &id3);

 void myexit(int a);

-int add_iptables_rule(char *);
+int add_iptables_rule(const char *);

 int clear_iptables_rule();

+int iptables_gen_add(const char * s,u32_t const_id);
+int iptables_rule_init(const char * s,u32_t const_id,int keep);
+int keep_iptables_rule();
+
+const int show_none=0;
+const int show_command=0x1;
+const int show_log=0x2;
+const int show_all=show_command|show_log;
+int run_command(string command,char * &output,int flag=show_all);
+//int run_command_no_log(string command,char * &output);
+int read_file(const char * file,string &output);
+
+vector<string> string_to_vec(const char * s,const char * sp);
+vector< vector <string> > string_to_vec2(const char * s);
+
+string trim(const string& str, char c);
+
+string trim_conf_line(const string& str);
+
+vector<string> parse_conf_line(const string& s);
+
+int hex_to_u32_with_endian(const string & a,u32_t &output);
+int hex_to_u32(const string & a,u32_t &output);
+//extern string iptables_pattern;
+
 #endif /* COMMON_H_ */
--- a/config.example
+++ b/config.example
@@ -0,0 +1,16 @@
+# Basically this file is the equivalent to splitting the command line options into multiple lines
+# Each line should contain an option
+
+# This is client
+-c
+# Or use -s if you use it on server side
+# Define local address
+-l 127.0.0.1:56789
+# Define remote address
+-r 45.66.77.88:45678
+# Password
+-k my_awesome_password
+# Mode
+--raw-mode faketcp
+# Log Level
+--log-level 4
--- a/doc/README.zh-cn.md
+++ b/doc/README.zh-cn.md
@@ -1,6 +1,8 @@
 Udp2raw-tunnel 
 ![image2](/images/image2.PNG)
-udp2raw tunnel，通过raw socket给UDP包加上TCP或ICMP header，进而绕过UDP屏蔽或QoS，或在UDP不稳定的环境下提升稳定性。支持心跳保活、自动重连，重连后会恢复上次连接，在底层掉线的情况下可以保持上层不掉线。同时有加密、防重放攻击、信道复用的功能。
+udp2raw tunnel，通过raw socket给UDP包加上TCP或ICMP header，进而绕过UDP屏蔽或QoS，或在UDP不稳定的环境下提升稳定性。可以有效防止在使用kcptun或者finalspeed的情况下udp端口被运营商限速。
+
+支持心跳保活、自动重连，重连后会恢复上次连接，在底层掉线的情况下可以保持上层不掉线。同时有加密、防重放攻击、信道复用的功能。

 **欢迎任何形式的转载**

@@ -13,6 +15,12 @@ udp2raw tunnel，通过raw socket给UDP包加上TCP或ICMP header，进而绕过
 如果你需要加速跨国网游、网页浏览，解决方案在另一个repo：

 https://github.com/wangyu-/UDPspeeder
+# 支持的平台
+Linux主机，有root权限。可以是PC、android手机/平板、openwrt路由器、树莓派。主机上最好安装了iptables命令(apt/yum很容易安装)。
+
+在windows和mac上预装了udp2raw的虚拟机镜像已发布，可以用Vmware或VirtualBox加载，容量4.4mb，已经配置好了自动获取网卡ip，开机即用，稳定，性能很好。
+（udp2raw跑在虚拟机里，其他应用照常跑在windows上）（确保虚拟机网卡工作在桥接模式）（Vmware player 75mb,VirtualBox 118mb,很容易安装）。
+
 # 功能特性
 ### 把udp流量伪装成tcp /icmp
 用raw socket给udp包加上tcp/icmp包头，可以突破udp流量限制或Udp QOS。或者在udp nat有问题的环境下，提升稳定性。  另外也支持用raw 发udp包，这样流量不会被伪装，只会被加密。
@@ -27,10 +35,10 @@ Client能用单倍的超时时间检测到单向链路的失效，不管是上

 对于有大量client的情况，对于不同client,server发送的心跳是错开时间发送的，不会因为短时间发送大量的心跳而造成拥塞和延迟抖动。

-### 加密 防重放攻击 防中间人攻击
+### 加密 防重放攻击
 用aes128cbc加密，md5/crc32做数据完整校验。用类似ipsec/openvpn的 replay window机制来防止重放攻击。

-设计目标是，即使攻击者可以监听到tunnel的所有包，可以选择性丢弃tunnel的任意包，可以重放任意包；攻击者也没办法获得tunnel承载的任何数据，也没办法向tunnel的数据流中通过包构造/包重放插入任何数据。client和server互相认证对方，无法被中间人攻击。
+设计目标是，即使攻击者可以监听到tunnel的所有包，可以选择性丢弃tunnel的任意包，可以重放任意包；攻击者也没办法获得tunnel承载的任何数据，也没办法向tunnel的数据流中通过包构造/包重放插入任何数据。

 ### 其他特性
 信道复用，client的udp端支持多个连接。
@@ -50,9 +58,6 @@ epoll纯异步，高并发，除了回收过期连接外，所有操作的时间

 # 简明操作说明

-### 环境要求
-Linux主机，有root权限。主机上最好安装了iptables命令(apt/yum很容易安装)。在windows和mac上可以开虚拟机（udp2raw跑在linux里，其他应用照常跑在windows上，桥接模式测试可用）。
-
 ### 安装
 下载编译好的二进制文件，解压到任意目录。

@@ -62,22 +67,28 @@ https://github.com/wangyu-/udp2raw-tunnel/releases
 假设你有一个server，ip为44.55.66.77，有一个服务监听在udp 7777端口。 假设你本地的主机到44.55.66.77的UDP流量被屏蔽了，或者被qos了

 ```
-在client端运行:
-./udp2raw_amd64 -c -l0.0.0.0:3333  -r44.55.66.77:4096 -a -k "passwd" --raw-mode faketcp
-
 在server端运行:
 ./udp2raw_amd64 -s -l0.0.0.0:4096 -r 127.0.0.1:7777  -a -k "passwd" --raw-mode faketcp

+在client端运行:
+./udp2raw_amd64 -c -l0.0.0.0:3333  -r44.55.66.77:4096 -a -k "passwd" --raw-mode faketcp
 ```
+###### Server端输出:
+![](/images/output_server.PNG)
+###### Client端输出:
+![](/images/output_client.PNG)

 现在client和server之间建立起了，tunnel。想要在本地连接44.55.66.77:7777，只需要连接 127.0.0.1:3333。来回的所有的udp流量会被经过tunneling发送。在外界看起来是tcp流量，不会有udp流量暴露到公网。

+### 提醒
+如果要在anroid上运行，请看[Android简明教程](/doc/android_guide.md)
+
 # 进阶操作说明

 ### 命令选项
 ```
 udp2raw-tunnel
-version: Aug  5 2017 21:03:54
+version: Aug 18 2017 00:29:11
 repository: https://github.com/wangyu-/udp2raw-tunnel

 usage:
@@ -87,8 +98,8 @@ usage:
 common options,these options must be same on both side:
    --raw-mode            <string>        avaliable values:faketcp(default),udp,icmp
    -k,--key              <string>        password to gen symetric key,default:"secret key"
-    --auth-mode           <string>        avaliable values:aes128cbc(default),xor,none
-    --cipher-mode         <string>        avaliable values:md5(default),crc32,simple,none
+    --cipher-mode         <string>        avaliable values:aes128cbc(default),xor,none
+    --auth-mode           <string>        avaliable values:md5(default),crc32,simple,none
    -a,--auto-rule                        auto add (and delete) iptables rule
    -g,--gen-rule                         generate iptables rule then exit
    --disable-anti-replay                 disable anti-replay,not suggested
@@ -108,18 +119,71 @@ other options:
                                          0:dont increase
                                          1:increase every packet
                                          2:increase randomly, about every 3 packets (default)
+    --lower-level         <string>        send packet at OSI level 2, format:'if_name#dest_mac_adress'
+                                          ie:'eth0#00:23:45:67:89:b9'.Beta.
    -h,--help                             print this help message
 ```
-### iptables 规则
+
+### iptables 规则,`-a`和`-g`
 用raw收发tcp包本质上绕过了linux内核的tcp协议栈。linux碰到raw socket发来的包会不认识，如果一直收到不认识的包，会回复大量RST，造成不稳定或性能问题。所以强烈建议添加iptables规则屏蔽Linux内核的对指定端口的处理。用-a选项，udp2raw会在启动的时候自动帮你加上Iptables规则，退出的时候再自动删掉。如果长期使用，可以用-g选项来生成相应的Iptables规则再自己手动添加，这样规则不会在udp2raw退出时被删掉，可以避免停掉udp2raw后内核向对端回复RST。

 用raw收发udp包也类似，只是内核回复的是icmp unreachable。而用raw 收发icmp，内核会自动回复icmp echo。都需要相应的iptables规则。
-### cipher-mode 和 auth-mode 
+### `--cipher-mode` 和 `--auth-mode` 
 如果要最大的安全性建议用aes128cbc+md5。如果要运行再路由器上，建议xor+simple。但是注意xor+simple只能骗过防火墙的包检测，不能防止真正的攻击者。

-### seq-mode
+### `--seq-mode`
 facktcp模式并没有模拟tcp的全部。所以理论上有办法把faketcp和真正的tcp流量区分开来（虽然大部分ISP不太可能做这种程度的包检测）。seq-mode可以改变一些seq ack的行为。如果遇到了连接问题，可以尝试更改。在我这边的移动线路用3种模式都没问题。

+### `--lower-level`
+大部分udp2raw不能连通的情况都是设置了不兼容的iptables造成的。--lower-level选项允许绕过本地iptables。在一些iptables不好改动的情况下尤其有效（比如你用的是梅林固件，iptables全是固件自己生成的）。
+
+##### 格式
+`if_name#dest_mac_adress`,例如 `eth0#00:23:45:67:89:b9` 。`eth0`换成你的出口网卡名。`00:23:45:67:89:b9`换成网关的mac地址（如果client和server在同一个局域网内，可能不需要网关，这时候直接用对方主机的mac地址，这个属于罕见的应用场景，可以忽略）。
+
+##### client端获得--lower-level参数的办法
+在client 端，运行`traceroute <server_ip>`，记下第一跳的地址，这个就是`网关ip`。再运行`arp -s <网关ip>`，可以同时查到出口网卡名和mac。
+
+![](/images/lower_level.PNG)
+##### server端获得--lower-level参数的办法
+如果client有公网ip，就`traceroute <client_ip>`。下一步和client端的方法一样。
+
+如果client没有公网ip，就`traceroute google.com` 或`traceroute baidu.com`。下一步和client端的方法一样。
+
+server端也可以用`--lower-level auto` 来尝试自动获得参数，如果无法连接再手动填写。
+
+##### 注意
+如果用了`--lower-level`选项。server虽然还可以bind在0.0.0.0，但是因为你显式指定了网络接口，就只能工作在这一个网络接口了。
+
+如果`arps -s`命令查询不到，首先再试几次。如果还是查询不到，那么可能是因为你用的是pppoe方式的拨号宽带，查询不到是正常的。这种情况下`if_name`填pppoe产生的虚拟interface，通常名字叫`pppXXXX`，从`ifconfig`命令的输出里找一下；`des_mac_adress`填`00:00:00:00:00:00`,例如`ppp0#00:00:00:00:00:00`
+
+### `--conf-file`
+
+为了避免将密码等私密信息暴露在进程命令行参数内，你也可以使用 `配置文件` 来存储参数。
+
+比如，将以上服务端参数改写成配置文件
+
+`server.conf`:
+
+```
+-s
+# 你可以像这样添加注释
+# 注意，只有整行注释才能在配置文件里使用
+# 注释必须独占一行
+-l 0.0.0.0:4096
+-r 127.0.0.1:7777
+-a
+-k passwd
+--raw-mode faketcp
+```
+
+注意，当写入配置文件的时候，密码等参数两边的引号必须去除。
+
+然后就可以使用下面的方式启动服务端
+
+```bash
+./udp2raw_amd64 --conf-file server.conf
+```
+
 # 性能测试
 iperf3 的UDP模式有BUG，所以，这里用iperf3的tcp模式，配合Openvpn，测试udp2raw的性能。（iperf3 udp issue ,https://github.com/esnet/iperf/issues/296 ）

--- a/doc/android_guide.md
+++ b/doc/android_guide.md
@@ -0,0 +1,29 @@
+# How to run udp2raw on a rooted android device(arm cpu)
+
+There is currently no GUI for udp2raw on android.Make sure you have installed Terminal to run it.
+
+Download udp2raw_arm from https://github.com/wangyu-/udp2raw-tunnel/releases.
+
+Copy udp2raw_arm to any dir of your **internal storage** .Copying it to **SD card wont work**.
+
+# Steps
+1.  run udp2raw_arm  as usual, except you must change the -a option to -g
+```
+./udp2raw_arm -c -r 44.55.66.77:9966 -l 0.0.0.0:4000 -k1234 --cipher xor -g
+```
+
+2. find the generated iptables rule from udp2raw's output,add it manually by running:
+```
+iptables -I INPUT -s 44.55.66.77/32 -p tcp -m tcp --sport 9966 -j DROP
+```
+
+3. run udp2raw_ram without -g command
+
+```
+./udp2raw_arm -c -r 44.55.66.77:9966 -l 0.0.0.0:4000 -k1234 --cipher xor 
+```
+
+# ScreenShot 
+zoom-in if not large enough
+
+![](/images/android.png)
--- a/doc/build_guide.md
+++ b/doc/build_guide.md
@@ -71,4 +71,6 @@ modify first line of makefile to:
 cc_cross=/home/wangyu/Desktop/OpenWrt-SDK-15.05-ar71xx-generic_gcc-4.8-linaro_uClibc-0.9.33.2.Linux-x86_64/staging_dir/toolchain-mips_34kc_gcc-4.8-linaro_uClibc-0.9.33.2/bin/mips-openwrt-linux-g++
 ```

-run 'make cross'，the just generated udp2raw_cross is the binary,compile done. copy it to your router to run.
+run `make cross`，the just generated `udp2raw_cross` is the binary,compile done. copy it to your router to run.
+
+`make cross` generates non-static binary. If you have any problem on running it,try to compile a static binary by using `make cross2` or `make cross3`.If your toolchain supports static compiling, usually one of them will succeed. The generated file is still named `udp2raw_cross`.
--- a/doc/build_guide.zh-cn.md
+++ b/doc/build_guide.zh-cn.md
@@ -69,4 +69,6 @@ http://downloads.openwrt.org/chaos_calmer/15.05/ar71xx/generic/OpenWrt-SDK-15.05
 cc_cross=/home/wangyu/Desktop/OpenWrt-SDK-15.05-ar71xx-generic_gcc-4.8-linaro_uClibc-0.9.33.2.Linux-x86_64/staging_dir/toolchain-mips_34kc_gcc-4.8-linaro_uClibc-0.9.33.2/bin/mips-openwrt-linux-g++
 ```

-执行make cross，目录下生成udp2raw_cross文件。编译完成。
+执行`make cross`，目录下生成udp2raw_cross文件。编译完成。
+
+`make cross`编译出的binary是非静态的。如果运行有问题，可以尝试用`make cross2`或`make cross3`编译静态的binary,你的工具链必须带静态库才能成功编译,生成的文件仍然叫udp2raw_cross.
--- a/doc/finalspeed_step_by_step.md
+++ b/doc/finalspeed_step_by_step.md
@@ -8,7 +8,7 @@
 ##### 摘要
 udp2raw是一个把udp流量通过raw socket包装成tcp流量的工具。通过用udp2raw配合udp模式的 finalspeed一样可以达到在底层发tcp包，绕过QOS的效果。支持openvz,稳定性也好很多。原理上相当于在finalspeed外面再包了一层tunnel。

-本教程会一步一步演示用udp2raw+kcptun加速http流量的过程。加速任何其他tcp流量也一样。
+本教程会一步一步演示用udp2raw+finalspeed加速http流量的过程。加速任何其他tcp流量也一样，包括ss。本文避免讨论科学上网，所以只演示加速http流量。

 udp2raw也支持把udp流量包装成Icmp发送，本教程不做演示。

@@ -17,12 +17,14 @@ udp2raw也支持把udp流量包装成Icmp发送，本教程不做演示。

 本地主机是windows,本地有openwrt路由器或树莓派或安装了linux虚拟机（网卡设置为桥接模式）。

-(如果嫌给虚拟机安装linux麻烦，可以下载别人提供好的linux虚拟机镜像，比如https://www.kali.org/downloads/ ，不过我没有测试过这个镜像,我用的是debian 7)
+(如果嫌给虚拟机安装linux麻烦，可以用release里发布的预装了udp2raw的openwrt_x86虚拟机镜像，容量4.4mb)
+
+下面的教程按虚拟机演示，如果你有openwrt路由器或树莓派，可以直接运行再路由器或树莓派上，就不需要虚拟机了。

 ### 安装
 下载好udp2raw的压缩包，解压分别解压到服务器和本地的虚拟机。

-https://github.com/xtaci/kcptun/releases
+https://github.com/wangyu-/udp2raw-tunnel/releases

 在服务器端安装好finalspeed服务端，在本地windows安装好finalspeed的客户端。服务端我以前是用91yun的一键安装脚本安装的，没装过的可以去网上搜一键安装脚本。

@@ -52,7 +54,7 @@ netstat -nlp|grep java

 记下红框中的ip,这是虚拟机的网卡ip

-在server端也会显示server_reay
+在server端也会显示server_ready
 ![image](finalspeed_step_by_step/Capture4.PNG)

 4.在本地windows,按图配置好finalspeed的客户端。注意，192.168.205.8改成你刚才记下来的IP，带宽也要按实际的填。传输协议要选UDP.
--- a/doc/kcptun_step_by_step.md
+++ b/doc/kcptun_step_by_step.md
@@ -4,7 +4,9 @@
 本教程会一步一步演示用udp2raw+kcptun加速SSH流量的过程。加速任何其他tcp流量也一样，包括ss；本文避免涉及科学上网，所以演示ssh。

 ### 环境要求
-两边的主机都是linux，有root权限。 可以是openwrt路由器或树莓派，windows上桥接模式的虚拟机也可用
+两边的主机都是linux，有root权限。 可以是openwrt路由器或树莓派，也可以是root了的android。
+
+(windows和mac可以用release里发布的预装了udp2raw的openwrt_x86虚拟机镜像，容量4.4mb,开机即用)


 ### 安装
--- a/doc/openvpn_guide.md
+++ b/doc/openvpn_guide.md
@@ -2,6 +2,7 @@
 ![image_vpn](/images/openvpn.PNG)

 ![image4](/images/image4.PNG)
+
 # udp2raw command
 #### run at server side
 ```
@@ -13,6 +14,8 @@ assume server ip is 45.66.77.88
 ./udp2raw_amd64 -s -l0.0.0.0:3333 -r 45.66.77.88:8855 -k "passwd" --raw-mode faketcp -a
 ```

+#### hint
+You can add `--cipher-mode xor` `--auth-mode simple` to **both** sides to obtain maximum performance(but poor security).

 # openvpn config

@@ -38,7 +41,7 @@ mute 20

 comp-lzo no
 cipher none      ##### disable openvpn 's cipher and auth for maxmized peformance. 
-auth none        ##### you can enable openvpn's cipher and auth,if you dont care about peformance,oryou dont trust udp2raw 's encryption
+auth none        ##### you can enable openvpn's cipher and auth,if you dont care about peformance,or you dont trust udp2raw 's encryption

 fragment 1200       ##### very important    you can turn it up a bit. but,the lower the safer
 mssfix 1200         ##### very important
@@ -80,7 +83,7 @@ mute 20

 comp-lzo no
 cipher none      ##### disable openvpn 's cipher and auth for maxmized peformance. 
-auth none        ##### you can enable openvpn's cipher and auth,if you dont care about peformance,oryou dont trust udp2raw 's encryption
+auth none        ##### you can enable openvpn's cipher and auth,if you dont care about peformance,or you dont trust udp2raw 's encryption

 fragment 1200       ##### very important    you can turn it up a bit. but,the lower the safer
 mssfix 1200         ##### very important
--- a/encrypt.cpp
+++ b/encrypt.cpp
@@ -12,14 +12,14 @@

 static int8_t zero_iv[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,   0,0,0,0};//this prog use zero iv,you should make sure first block of data contains a random/nonce data
 /****
- * important!
- * why zero iv + nonce first data block is secure?
+ * security of zero_iv + nonce first data block
 * https://crypto.stackexchange.com/questions/5421/using-cbc-with-a-fixed-iv-and-a-random-first-plaintext-block
 ****/
+
 unordered_map<int, const char *> auth_mode_tostring = {{auth_none, "none"}, {auth_md5, "md5"}, {auth_crc32, "crc32"},{auth_simple,"simple"}};
 unordered_map<int, const char *> cipher_mode_tostring={{cipher_none,"none"},{cipher_aes128cbc,"aes128cbc"},{cipher_xor,"xor"}};

-auth_mode_t auth_mode=auth_crc32;
+auth_mode_t auth_mode=auth_md5;
 cipher_mode_t cipher_mode=cipher_aes128cbc;


@@ -216,7 +216,7 @@ int auth_crc32_verify(const char *data,int &len)
 {
 	if(len<int(sizeof(unsigned int)))
 	{
-		mylog(log_debug,"auth_crc32_verify len<16\n");
+		mylog(log_debug,"auth_crc32_verify len<%d\n",int(sizeof(unsigned int)));
 		return -1;
 	}
 	unsigned int  ret=crc32h((unsigned char *)data,len-sizeof(unsigned int));
@@ -316,6 +316,7 @@ int my_encrypt(const char *data,char *output,int &len,char * key)
 	return 0;

 }
+
 int my_decrypt(const char *data,char *output,int &len,char * key)
 {
 	if(len<0) return -1;
@@ -327,99 +328,6 @@ int my_decrypt(const char *data,char *output,int &len,char * key)
 	return 0;
 }

-int my_encrypt_old(const char *data0,char *output,int &len,char * key)
-{
-	static const int disable_all=0;
-	static const int disable_aes=0;
-
-	char data[buf_len];
-	memcpy(data,data0,len);
-
-	if(disable_all)
-	{
-		memcpy(output,data,len);
-		return 0;
-	}
-
-	int ori_len=len;
-
-	len=len+16;//md5
-	len+=2;//length
-
-	if(len%16!=0)
-	{
-		len= (len/16)*16+16;
-	}
-
-	if(len>max_data_len) return -1;
-
-	data[len-16-2]= (unsigned char)( (uint16_t(ori_len))>>8);
-	data[len-16-1]=(unsigned char)( ((uint16_t(ori_len))<<8)>>8) ;
-
-
-	//printf("%d %d\n",data[len-16-2],data[len-16-1]);
-	md5((unsigned char *)data,len-16,(unsigned char *)(data+len-16));
-
-
-	if(disable_aes)
-	{
-		memcpy(output,data,len);
-
-	}
-	else
-	{
-		AES_CBC_encrypt_buffer((unsigned char *)output,(unsigned char *)data,len,(unsigned char *)key,(unsigned char *)zero_iv);
-		//it doesnt allow over lap
-	}
-
-
-	return 0;
-}
-int my_decrypt_old(const char *data0,char *output,int &len,char * key)
-{
-	static const int disable_all=0;
-	static const int disable_aes=0;
-
-	char data[buf_len];
-	memcpy(data,data0,len);
-
-	if(disable_all)
-	{
-		memcpy(output,data,len);
-		return 0;
-	}
-	uint8_t md5_res[16];
-	if(len>max_data_len) return -1;
-	if(len<32) return -1;
-	if(len%16 !=0) return -1;
-
-
-	if(disable_aes)
-	{
-		memcpy(output,data,len);
-	}
-	else
-	{
-		AES_CBC_decrypt_buffer((unsigned char *)output,(unsigned char *)data,len,(unsigned char *)key,(unsigned char *)zero_iv);
-	}
-
-
-	//printf("%d %d\n",data[len-16-2],data[len-16-1]);
-
-	//printf("<<%d>>",len);
-
-	md5((unsigned char *)output,len-16,(unsigned char *)md5_res);
-
-	if(memcmp(output+len-16,md5_res,16)!=0)
-	{
-		return -2;
-	}
-
-	len=((unsigned char)output[len-16-2])*256u+((unsigned char)output[len-16-1]);  //this may be broken because of sign
-
-	return 0;
-}
-
 int my_encrypt_pesudo_header(uint8_t *data,uint8_t *output,int &len,uint8_t * key,uint8_t *header,int hlen)
 {

--- a/images/android.png
+++ b/images/android.png
--- a/images/lower_level.PNG
+++ b/images/lower_level.PNG
--- a/images/output_client.PNG
+++ b/images/output_client.PNG
--- a/images/output_server.PNG
+++ b/images/output_server.PNG
--- a/images/speed_test.PNG
+++ b/images/speed_test.PNG
--- a/lib/aes_acc/aesacc.c
+++ b/lib/aes_acc/aesacc.c
@@ -0,0 +1,394 @@
+/*
+ * This file is adapted from PolarSSL 1.3.19 (GPL)
+ */
+
+#include "aesni.h"
+#include "aesarm.h"
+#include <stdint.h>
+#include <string.h>
+
+#if defined(AES256) && (AES256 == 1)
+#define AES_KEYSIZE 256
+#ifdef HAVE_AMD64
+  #define aeshw_setkey_enc aesni_setkey_enc_256
+#endif
+#elif defined(AES192) && (AES192 == 1)
+#define AES_KEYSIZE 192
+#ifdef HAVE_AMD64
+  #define aeshw_setkey_enc aesni_setkey_enc_192
+#endif
+#else
+#define AES_KEYSIZE 128
+#ifdef HAVE_AMD64
+  #define aeshw_setkey_enc aesni_setkey_enc_128
+#endif
+#endif
+
+#define AES_NR ((AES_KEYSIZE >> 5) + 6)
+#define AES_RKSIZE      272
+
+#ifdef HAVE_AMD64
+#define HAVE_HARDAES 1
+#define aeshw_supported aesni_supported
+#define aeshw_crypt_ecb aesni_crypt_ecb
+#define aeshw_inverse_key(a,b) aesni_inverse_key(a,b,AES_NR)
+#endif /* HAVE_AMD64 */
+
+#ifdef HAVE_ARM64
+#define HAVE_HARDAES 1
+#define aeshw_supported aesarm_supported
+#define aeshw_crypt_ecb aesarm_crypt_ecb
+
+#include "aesarm_table.h"
+
+#ifndef GET_UINT32_LE
+#define GET_UINT32_LE(n,b,i)                            \
+{                                                       \
+    (n) = ( (uint32_t) (b)[(i)    ]       )             \
+        | ( (uint32_t) (b)[(i) + 1] <<  8 )             \
+        | ( (uint32_t) (b)[(i) + 2] << 16 )             \
+        | ( (uint32_t) (b)[(i) + 3] << 24 );            \
+}
+#endif
+
+static void aeshw_setkey_enc(uint8_t *rk, const uint8_t *key)
+{
+    unsigned int i;
+    uint32_t *RK;
+
+    RK = (uint32_t *) rk;
+
+    for( i = 0; i < ( AES_KEYSIZE >> 5 ); i++ )
+    {
+        GET_UINT32_LE( RK[i], key, i << 2 );
+    }
+
+    switch( AES_NR )
+    {
+        case 10:
+
+            for( i = 0; i < 10; i++, RK += 4 )
+            {
+                RK[4]  = RK[0] ^ RCON[i] ^
+                ( (uint32_t) FSb[ ( RK[3] >>  8 ) & 0xFF ]       ) ^
+                ( (uint32_t) FSb[ ( RK[3] >> 16 ) & 0xFF ] <<  8 ) ^
+                ( (uint32_t) FSb[ ( RK[3] >> 24 ) & 0xFF ] << 16 ) ^
+                ( (uint32_t) FSb[ ( RK[3]       ) & 0xFF ] << 24 );
+
+                RK[5]  = RK[1] ^ RK[4];
+                RK[6]  = RK[2] ^ RK[5];
+                RK[7]  = RK[3] ^ RK[6];
+            }
+            break;
+
+        case 12:
+
+            for( i = 0; i < 8; i++, RK += 6 )
+            {
+                RK[6]  = RK[0] ^ RCON[i] ^
+                ( (uint32_t) FSb[ ( RK[5] >>  8 ) & 0xFF ]       ) ^
+                ( (uint32_t) FSb[ ( RK[5] >> 16 ) & 0xFF ] <<  8 ) ^
+                ( (uint32_t) FSb[ ( RK[5] >> 24 ) & 0xFF ] << 16 ) ^
+                ( (uint32_t) FSb[ ( RK[5]       ) & 0xFF ] << 24 );
+
+                RK[7]  = RK[1] ^ RK[6];
+                RK[8]  = RK[2] ^ RK[7];
+                RK[9]  = RK[3] ^ RK[8];
+                RK[10] = RK[4] ^ RK[9];
+                RK[11] = RK[5] ^ RK[10];
+            }
+            break;
+
+        case 14:
+
+            for( i = 0; i < 7; i++, RK += 8 )
+            {
+                RK[8]  = RK[0] ^ RCON[i] ^
+                ( (uint32_t) FSb[ ( RK[7] >>  8 ) & 0xFF ]       ) ^
+                ( (uint32_t) FSb[ ( RK[7] >> 16 ) & 0xFF ] <<  8 ) ^
+                ( (uint32_t) FSb[ ( RK[7] >> 24 ) & 0xFF ] << 16 ) ^
+                ( (uint32_t) FSb[ ( RK[7]       ) & 0xFF ] << 24 );
+
+                RK[9]  = RK[1] ^ RK[8];
+                RK[10] = RK[2] ^ RK[9];
+                RK[11] = RK[3] ^ RK[10];
+
+                RK[12] = RK[4] ^
+                ( (uint32_t) FSb[ ( RK[11]       ) & 0xFF ]       ) ^
+                ( (uint32_t) FSb[ ( RK[11] >>  8 ) & 0xFF ] <<  8 ) ^
+                ( (uint32_t) FSb[ ( RK[11] >> 16 ) & 0xFF ] << 16 ) ^
+                ( (uint32_t) FSb[ ( RK[11] >> 24 ) & 0xFF ] << 24 );
+
+                RK[13] = RK[5] ^ RK[12];
+                RK[14] = RK[6] ^ RK[13];
+                RK[15] = RK[7] ^ RK[14];
+            }
+            break;
+    }
+}
+
+static void aeshw_inverse_key(uint8_t *invkey, const uint8_t *fwdkey)
+{
+  int i, j;
+  uint32_t *RK;
+  uint32_t *SK;
+
+  RK = (uint32_t *) invkey;
+  SK = ((uint32_t *) fwdkey) + AES_NR * 4;
+
+  *RK++ = *SK++;
+  *RK++ = *SK++;
+  *RK++ = *SK++;
+  *RK++ = *SK++;
+
+  for( i = AES_NR - 1, SK -= 8; i > 0; i--, SK -= 8 )
+  {
+      for( j = 0; j < 4; j++, SK++ )
+      {
+          *RK++ = RT0[ FSb[ ( *SK       ) & 0xFF ] ] ^
+                  RT1[ FSb[ ( *SK >>  8 ) & 0xFF ] ] ^
+                  RT2[ FSb[ ( *SK >> 16 ) & 0xFF ] ] ^
+                  RT3[ FSb[ ( *SK >> 24 ) & 0xFF ] ];
+      }
+  }
+
+  *RK++ = *SK++;
+  *RK++ = *SK++;
+  *RK++ = *SK++;
+  *RK++ = *SK++;
+}
+#endif /* HAVE_ARM64 */
+
+#ifdef HAVE_HARDAES
+static void aeshw_setkey_dec(uint8_t *rk, const uint8_t *key)
+{
+  uint8_t rk_tmp[AES_RKSIZE];
+  aeshw_setkey_enc(rk_tmp, key);
+  aeshw_inverse_key(rk, rk_tmp);
+}
+
+static void aeshw_encrypt_ecb( int nr,
+                               unsigned char *rk,
+                               const unsigned char input[16],
+                               unsigned char output[16] )
+{
+  aeshw_crypt_ecb(nr, rk, AES_ENCRYPT, input, output);
+}
+
+static void aeshw_decrypt_ecb( int nr,
+                               unsigned char *rk,
+                               const unsigned char input[16],
+                               unsigned char output[16] )
+{
+  aeshw_crypt_ecb(nr, rk, AES_DECRYPT, input, output);
+}
+#endif /* HAVE_HARDAES */
+
+/* OpenSSL assembly functions */
+#define AES_MAXNR 14
+typedef struct {
+  uint32_t rd_key[4 * (AES_MAXNR + 1)];
+  uint32_t rounds;
+} AES_KEY;
+
+#if defined(__amd64__) || defined(__x86_64__) || \
+    defined(__aarch64__)
+#define AES_set_encrypt_key vpaes_set_encrypt_key
+#define AES_set_decrypt_key vpaes_set_decrypt_key
+#define AES_encrypt vpaes_encrypt
+#define AES_decrypt vpaes_decrypt
+#endif /* VPAES for 64-bit Intel and ARM */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key);
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key);
+
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+                 const AES_KEY *key);
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+                 const AES_KEY *key);
+
+#ifdef __cplusplus
+}
+#endif
+
+static void aes_encrypt_ecb( int nr,
+                             unsigned char *rk,
+                             const unsigned char input[16],
+                             unsigned char output[16] )
+{
+  AES_encrypt(input, output, (AES_KEY *) rk);
+}
+
+static void aes_decrypt_ecb( int nr,
+                             unsigned char *rk,
+                             const unsigned char input[16],
+                             unsigned char output[16] )
+{
+  AES_decrypt(input, output, (AES_KEY *) rk);
+}
+
+static void aes_setkey_enc(uint8_t *rk, const uint8_t *key)
+{
+  AES_set_encrypt_key(key, AES_KEYSIZE, (AES_KEY *) rk);
+}
+
+static void aes_setkey_dec(uint8_t *rk, const uint8_t *key)
+{
+  AES_set_decrypt_key(key, AES_KEYSIZE, (AES_KEY *) rk);
+}
+
+static void (*encrypt_ecb) ( int nr,
+                             unsigned char *rk,
+                             const unsigned char input[16],
+                             unsigned char output[16] )
+  = aes_encrypt_ecb;
+
+static void (*decrypt_ecb) ( int nr,
+                             unsigned char *rk,
+                             const unsigned char input[16],
+                             unsigned char output[16] )
+  = aes_decrypt_ecb;
+
+static void (*setkey_enc) (uint8_t *rk, const uint8_t *key)
+  = aes_setkey_enc;
+
+static void (*setkey_dec) (uint8_t *rk, const uint8_t *key)
+  = aes_setkey_dec;
+
+/*
+ * AESNI-CBC buffer encryption/decryption
+ */
+static void encrypt_cbc( uint8_t* rk,
+                         uint32_t length,
+                         uint8_t iv[16],
+                         const uint8_t *input,
+                         uint8_t *output )
+{
+    int i;
+    uint8_t temp[16];
+
+    while( length > 0 )
+    {
+        for( i = 0; i < 16; i++ )
+            output[i] = (uint8_t)( input[i] ^ iv[i] );
+
+        encrypt_ecb( AES_NR, rk, output, output );
+        memcpy( iv, output, 16 );
+
+        input  += 16;
+        output += 16;
+        length -= 16;
+    }
+}
+
+static void decrypt_cbc( uint8_t* rk,
+                         uint32_t length,
+                         uint8_t iv[16],
+                         const uint8_t *input,
+                         uint8_t *output )
+{
+    int i;
+    uint8_t temp[16];
+
+    while( length > 0 )
+    {
+        memcpy( temp, input, 16 );
+        decrypt_ecb( AES_NR, rk, input, output );
+
+        for( i = 0; i < 16; i++ )
+            output[i] = (uint8_t)( output[i] ^ iv[i] );
+
+        memcpy( iv, temp, 16 );
+
+        input  += 16;
+        output += 16;
+        length -= 16;
+    }
+}
+
+static void aeshw_init(void)
+{
+#ifdef HAVE_HARDAES
+  static int done = 0;
+  if (!done) {
+    if (aeshw_supported()) {
+      encrypt_ecb = aeshw_encrypt_ecb;
+      decrypt_ecb = aeshw_decrypt_ecb;
+      setkey_enc = aeshw_setkey_enc;
+      setkey_dec = aeshw_setkey_dec;
+    }
+    done = 1;
+  }
+#endif
+}
+
+int AES_support_hwaccel(void)
+{
+#ifdef HAVE_HARDAES
+  return aeshw_supported();
+#else
+  return 0;
+#endif
+}
+
+void AES_CBC_encrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv)
+{
+  uint8_t iv_tmp[16];
+  uint8_t rk[AES_RKSIZE];
+
+  if (key == NULL || iv == NULL)
+  {
+    return;
+  }
+  aeshw_init();
+  memcpy(iv_tmp, iv, 16);
+  setkey_enc(rk, key);
+  encrypt_cbc(rk, length, iv_tmp, input, output);
+}
+
+void AES_CBC_decrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv)
+{
+  uint8_t iv_tmp[16];
+  uint8_t rk[AES_RKSIZE];
+
+  if (key == NULL || iv == NULL)
+  {
+    return;
+  }
+  aeshw_init();
+  memcpy(iv_tmp, iv, 16);
+  setkey_dec(rk, key);
+  decrypt_cbc(rk, length, iv_tmp, input, output);
+}
+
+void AES_ECB_encrypt(const uint8_t* input, const uint8_t* key, uint8_t* output, const uint32_t length)
+{
+  uint8_t rk[AES_RKSIZE];
+
+  if (key == NULL)
+  {
+    return;
+  }
+  aeshw_init();
+  setkey_enc(rk, key);
+  encrypt_ecb(AES_NR, rk, input, output);
+}
+
+void AES_ECB_decrypt(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length)
+{
+  uint8_t rk[AES_RKSIZE];
+
+  if (key == NULL)
+  {
+    return;
+  }
+  aeshw_init();
+  setkey_dec(rk, key);
+  decrypt_ecb(AES_NR, rk, input, output);
+}
--- a/lib/aes_acc/aesarm.c
+++ b/lib/aes_acc/aesarm.c
@@ -0,0 +1,115 @@
+/*
+ * This file is adapted from https://github.com/CriticalBlue/mbedtls
+ */
+
+/*
+ *  ARMv8-A Cryptography Extension AES support functions
+ *
+ *  Copyright (C) 2016, CriticalBlue Limited, All Rights Reserved
+ *  SPDX-License-Identifier: Apache-2.0
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License"); you may
+ *  not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  This file is part of mbed TLS (https://tls.mbed.org)
+ */
+
+#include "aesarm.h"
+
+#if defined(HAVE_ARM64)
+
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#include <arm_neon.h>
+
+/*
+ * ARMv8a Crypto Extension support detection routine
+ */
+int aesarm_supported( void )
+{
+    static int done = 0;
+    static unsigned int c = 0;
+
+    if ( ! done )
+    {
+        c = getauxval(AT_HWCAP);
+        done = 1;
+    }
+
+    return ( c & HWCAP_AES ) != 0;
+}
+
+/*
+ * ARMv8a AES-ECB block en(de)cryption
+ */
+void aesarm_crypt_ecb( int nr,
+                       unsigned char *rk,
+                       int mode,
+                       const unsigned char input[16],
+                       unsigned char output[16] )
+{
+    int i;
+    uint8x16_t state_vec, roundkey_vec;
+    uint8_t *RK = (uint8_t *) rk;
+
+    // Load input and round key into into their vectors
+    state_vec = vld1q_u8( input );
+
+    if ( mode == AES_ENCRYPT )
+    {
+        // Initial AddRoundKey is in the loop due to AES instruction always doing AddRoundKey first
+        for( i = 0; i < nr - 1; i++ )
+        {
+            // Load Round Key
+            roundkey_vec = vld1q_u8( RK );
+            // Forward (AESE) round (AddRoundKey, SubBytes and ShiftRows)
+            state_vec = vaeseq_u8( state_vec, roundkey_vec );
+            // Mix Columns (AESMC)
+            state_vec = vaesmcq_u8( state_vec );
+            // Move pointer ready to load next round key
+            RK += 16;
+        }
+
+        // Final Forward (AESE) round (AddRoundKey, SubBytes and ShiftRows). No Mix columns
+        roundkey_vec = vld1q_u8( RK ); /* RK already moved in loop */
+        state_vec = vaeseq_u8( state_vec, roundkey_vec );
+    }
+    else
+    {
+        // Initial AddRoundKey is in the loop due to AES instruction always doing AddRoundKey first
+        for( i = 0; i < nr - 1; i++ )
+        {
+            // Load Round Key
+            roundkey_vec = vld1q_u8( RK );
+            // Reverse (AESD) round (AddRoundKey, SubBytes and ShiftRows)
+            state_vec = vaesdq_u8( state_vec, roundkey_vec );
+            // Inverse Mix Columns (AESIMC)
+            state_vec = vaesimcq_u8( state_vec );
+            // Move pointer ready to load next round key
+            RK += 16;
+        }
+
+        // Final Reverse (AESD) round (AddRoundKey, SubBytes and ShiftRows). No Mix columns
+        roundkey_vec = vld1q_u8( RK ); /* RK already moved in loop */
+        state_vec = vaesdq_u8( state_vec, roundkey_vec );
+    }
+
+    // Manually apply final Add RoundKey step (EOR)
+    RK += 16;
+    roundkey_vec = vld1q_u8( RK );
+    state_vec = veorq_u8( state_vec, roundkey_vec );
+
+    // Write results back to output array
+    vst1q_u8( output, state_vec );
+}
+
+#endif /* HAVE_ARM64 */
--- a/lib/aes_acc/aesarm.h
+++ b/lib/aes_acc/aesarm.h
@@ -0,0 +1,84 @@
+/*
+ * This file is adapted from https://github.com/CriticalBlue/mbedtls
+ */
+
+/**
+ * \file aes_armv8a_ce.h
+ *
+ * \brief AES support functions using the ARMv8-A Cryptography Extension for
+ * hardware acceleration on some ARM processors.
+ *
+ *  Copyright (C) 2016, CriticalBlue Limited, All Rights Reserved
+ *  SPDX-License-Identifier: Apache-2.0
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License"); you may
+ *  not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  This file is part of mbed TLS (https://tls.mbed.org)
+ */
+
+#ifndef _AESARM_H_
+#define _AESARM_H_
+
+#ifndef AES_ENCRYPT
+#define AES_ENCRYPT     1
+#endif
+
+#ifndef AES_DECRYPT
+#define AES_DECRYPT     0
+#endif
+
+#if defined(__GNUC__) && \
+    __ARM_ARCH >= 8 && \
+    __ARM_ARCH_PROFILE == 'A' && \
+    defined(__aarch64__) &&  \
+    defined(__ARM_FEATURE_CRYPTO) && \
+    defined(__linux__) && \
+    !defined(NO_AESACC)
+#define HAVE_ARM64
+#endif
+
+#if defined(HAVE_ARM64)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief          ARMv8-A features detection routine
+ *
+ * \return         1 if the CPU has support for the feature, 0 otherwise
+ */
+int aesarm_supported( void );
+
+/**
+ * \brief          AES ARMv8-A Cryptography Extension AES-ECB block en(de)cryption
+ *
+ * \param nr       number of rounds
+ * \param rk       AES round keys
+ * \param mode     AESARM_ENCRYPT or AESARM_DECRYPT
+ * \param input    16-byte input block
+ * \param output   16-byte output block
+ */
+void aesarm_crypt_ecb( int nr,
+                       unsigned char *rk,
+                       int mode,
+                       const unsigned char input[16],
+                       unsigned char output[16] );
+
+#ifdef __cplusplus
+}
+#endif 
+
+#endif /* HAVE_ARM64 */
+
+#endif /* _AESARM_H_ */
--- a/lib/aes_acc/aesarm_table.h
+++ b/lib/aes_acc/aesarm_table.h
@@ -0,0 +1,140 @@
+/*
+ * This file is adapted from PolarSSL 1.3.19 (GPL)
+ */
+
+/*
+ * Forward S-box
+ */
+static const unsigned char FSb[256] =
+{
+    0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5,
+    0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
+    0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
+    0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
+    0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC,
+    0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
+    0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A,
+    0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
+    0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
+    0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
+    0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B,
+    0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
+    0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85,
+    0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
+    0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
+    0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
+    0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17,
+    0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
+    0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88,
+    0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
+    0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
+    0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
+    0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9,
+    0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
+    0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6,
+    0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
+    0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
+    0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
+    0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94,
+    0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
+    0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68,
+    0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
+};
+
+/*
+ * Round constants
+ */
+static const uint32_t RCON[10] =
+{
+    0x00000001, 0x00000002, 0x00000004, 0x00000008,
+    0x00000010, 0x00000020, 0x00000040, 0x00000080,
+    0x0000001B, 0x00000036
+};
+
+/*
+ * Reverse tables
+ */
+#define RT \
+\
+    V(50,A7,F4,51), V(53,65,41,7E), V(C3,A4,17,1A), V(96,5E,27,3A), \
+    V(CB,6B,AB,3B), V(F1,45,9D,1F), V(AB,58,FA,AC), V(93,03,E3,4B), \
+    V(55,FA,30,20), V(F6,6D,76,AD), V(91,76,CC,88), V(25,4C,02,F5), \
+    V(FC,D7,E5,4F), V(D7,CB,2A,C5), V(80,44,35,26), V(8F,A3,62,B5), \
+    V(49,5A,B1,DE), V(67,1B,BA,25), V(98,0E,EA,45), V(E1,C0,FE,5D), \
+    V(02,75,2F,C3), V(12,F0,4C,81), V(A3,97,46,8D), V(C6,F9,D3,6B), \
+    V(E7,5F,8F,03), V(95,9C,92,15), V(EB,7A,6D,BF), V(DA,59,52,95), \
+    V(2D,83,BE,D4), V(D3,21,74,58), V(29,69,E0,49), V(44,C8,C9,8E), \
+    V(6A,89,C2,75), V(78,79,8E,F4), V(6B,3E,58,99), V(DD,71,B9,27), \
+    V(B6,4F,E1,BE), V(17,AD,88,F0), V(66,AC,20,C9), V(B4,3A,CE,7D), \
+    V(18,4A,DF,63), V(82,31,1A,E5), V(60,33,51,97), V(45,7F,53,62), \
+    V(E0,77,64,B1), V(84,AE,6B,BB), V(1C,A0,81,FE), V(94,2B,08,F9), \
+    V(58,68,48,70), V(19,FD,45,8F), V(87,6C,DE,94), V(B7,F8,7B,52), \
+    V(23,D3,73,AB), V(E2,02,4B,72), V(57,8F,1F,E3), V(2A,AB,55,66), \
+    V(07,28,EB,B2), V(03,C2,B5,2F), V(9A,7B,C5,86), V(A5,08,37,D3), \
+    V(F2,87,28,30), V(B2,A5,BF,23), V(BA,6A,03,02), V(5C,82,16,ED), \
+    V(2B,1C,CF,8A), V(92,B4,79,A7), V(F0,F2,07,F3), V(A1,E2,69,4E), \
+    V(CD,F4,DA,65), V(D5,BE,05,06), V(1F,62,34,D1), V(8A,FE,A6,C4), \
+    V(9D,53,2E,34), V(A0,55,F3,A2), V(32,E1,8A,05), V(75,EB,F6,A4), \
+    V(39,EC,83,0B), V(AA,EF,60,40), V(06,9F,71,5E), V(51,10,6E,BD), \
+    V(F9,8A,21,3E), V(3D,06,DD,96), V(AE,05,3E,DD), V(46,BD,E6,4D), \
+    V(B5,8D,54,91), V(05,5D,C4,71), V(6F,D4,06,04), V(FF,15,50,60), \
+    V(24,FB,98,19), V(97,E9,BD,D6), V(CC,43,40,89), V(77,9E,D9,67), \
+    V(BD,42,E8,B0), V(88,8B,89,07), V(38,5B,19,E7), V(DB,EE,C8,79), \
+    V(47,0A,7C,A1), V(E9,0F,42,7C), V(C9,1E,84,F8), V(00,00,00,00), \
+    V(83,86,80,09), V(48,ED,2B,32), V(AC,70,11,1E), V(4E,72,5A,6C), \
+    V(FB,FF,0E,FD), V(56,38,85,0F), V(1E,D5,AE,3D), V(27,39,2D,36), \
+    V(64,D9,0F,0A), V(21,A6,5C,68), V(D1,54,5B,9B), V(3A,2E,36,24), \
+    V(B1,67,0A,0C), V(0F,E7,57,93), V(D2,96,EE,B4), V(9E,91,9B,1B), \
+    V(4F,C5,C0,80), V(A2,20,DC,61), V(69,4B,77,5A), V(16,1A,12,1C), \
+    V(0A,BA,93,E2), V(E5,2A,A0,C0), V(43,E0,22,3C), V(1D,17,1B,12), \
+    V(0B,0D,09,0E), V(AD,C7,8B,F2), V(B9,A8,B6,2D), V(C8,A9,1E,14), \
+    V(85,19,F1,57), V(4C,07,75,AF), V(BB,DD,99,EE), V(FD,60,7F,A3), \
+    V(9F,26,01,F7), V(BC,F5,72,5C), V(C5,3B,66,44), V(34,7E,FB,5B), \
+    V(76,29,43,8B), V(DC,C6,23,CB), V(68,FC,ED,B6), V(63,F1,E4,B8), \
+    V(CA,DC,31,D7), V(10,85,63,42), V(40,22,97,13), V(20,11,C6,84), \
+    V(7D,24,4A,85), V(F8,3D,BB,D2), V(11,32,F9,AE), V(6D,A1,29,C7), \
+    V(4B,2F,9E,1D), V(F3,30,B2,DC), V(EC,52,86,0D), V(D0,E3,C1,77), \
+    V(6C,16,B3,2B), V(99,B9,70,A9), V(FA,48,94,11), V(22,64,E9,47), \
+    V(C4,8C,FC,A8), V(1A,3F,F0,A0), V(D8,2C,7D,56), V(EF,90,33,22), \
+    V(C7,4E,49,87), V(C1,D1,38,D9), V(FE,A2,CA,8C), V(36,0B,D4,98), \
+    V(CF,81,F5,A6), V(28,DE,7A,A5), V(26,8E,B7,DA), V(A4,BF,AD,3F), \
+    V(E4,9D,3A,2C), V(0D,92,78,50), V(9B,CC,5F,6A), V(62,46,7E,54), \
+    V(C2,13,8D,F6), V(E8,B8,D8,90), V(5E,F7,39,2E), V(F5,AF,C3,82), \
+    V(BE,80,5D,9F), V(7C,93,D0,69), V(A9,2D,D5,6F), V(B3,12,25,CF), \
+    V(3B,99,AC,C8), V(A7,7D,18,10), V(6E,63,9C,E8), V(7B,BB,3B,DB), \
+    V(09,78,26,CD), V(F4,18,59,6E), V(01,B7,9A,EC), V(A8,9A,4F,83), \
+    V(65,6E,95,E6), V(7E,E6,FF,AA), V(08,CF,BC,21), V(E6,E8,15,EF), \
+    V(D9,9B,E7,BA), V(CE,36,6F,4A), V(D4,09,9F,EA), V(D6,7C,B0,29), \
+    V(AF,B2,A4,31), V(31,23,3F,2A), V(30,94,A5,C6), V(C0,66,A2,35), \
+    V(37,BC,4E,74), V(A6,CA,82,FC), V(B0,D0,90,E0), V(15,D8,A7,33), \
+    V(4A,98,04,F1), V(F7,DA,EC,41), V(0E,50,CD,7F), V(2F,F6,91,17), \
+    V(8D,D6,4D,76), V(4D,B0,EF,43), V(54,4D,AA,CC), V(DF,04,96,E4), \
+    V(E3,B5,D1,9E), V(1B,88,6A,4C), V(B8,1F,2C,C1), V(7F,51,65,46), \
+    V(04,EA,5E,9D), V(5D,35,8C,01), V(73,74,87,FA), V(2E,41,0B,FB), \
+    V(5A,1D,67,B3), V(52,D2,DB,92), V(33,56,10,E9), V(13,47,D6,6D), \
+    V(8C,61,D7,9A), V(7A,0C,A1,37), V(8E,14,F8,59), V(89,3C,13,EB), \
+    V(EE,27,A9,CE), V(35,C9,61,B7), V(ED,E5,1C,E1), V(3C,B1,47,7A), \
+    V(59,DF,D2,9C), V(3F,73,F2,55), V(79,CE,14,18), V(BF,37,C7,73), \
+    V(EA,CD,F7,53), V(5B,AA,FD,5F), V(14,6F,3D,DF), V(86,DB,44,78), \
+    V(81,F3,AF,CA), V(3E,C4,68,B9), V(2C,34,24,38), V(5F,40,A3,C2), \
+    V(72,C3,1D,16), V(0C,25,E2,BC), V(8B,49,3C,28), V(41,95,0D,FF), \
+    V(71,01,A8,39), V(DE,B3,0C,08), V(9C,E4,B4,D8), V(90,C1,56,64), \
+    V(61,84,CB,7B), V(70,B6,32,D5), V(74,5C,6C,48), V(42,57,B8,D0)
+
+#define V(a,b,c,d) 0x##a##b##c##d
+static const uint32_t RT0[256] = { RT };
+#undef V
+
+#define V(a,b,c,d) 0x##b##c##d##a
+static const uint32_t RT1[256] = { RT };
+#undef V
+
+#define V(a,b,c,d) 0x##c##d##a##b
+static const uint32_t RT2[256] = { RT };
+#undef V
+
+#define V(a,b,c,d) 0x##d##a##b##c
+static const uint32_t RT3[256] = { RT };
+#undef V
+
+#undef RT
--- a/lib/aes_acc/aesni.c
+++ b/lib/aes_acc/aesni.c
@@ -0,0 +1,324 @@
+/*
+ * This file is adapted from PolarSSL 1.3.19 (GPL)
+ */
+
+/*
+ *  AES-NI support functions
+ *
+ *  Copyright (C) 2006-2014, ARM Limited, All Rights Reserved
+ *
+ *  This file is part of mbed TLS (https://tls.mbed.org)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * [AES-WP] http://software.intel.com/en-us/articles/intel-advanced-encryption-standard-aes-instructions-set
+ * [CLMUL-WP] http://software.intel.com/en-us/articles/intel-carry-less-multiplication-instruction-and-its-usage-for-computing-the-gcm-mode/
+ */
+
+#include <string.h>
+#include "aesni.h"
+
+#if defined(HAVE_AMD64)
+
+/*
+ * AES-NI support detection routine
+ */
+#define AESNI_AES 0x02000000u
+
+int aesni_supported( void )
+{
+    static int done = 0;
+    static unsigned int c = 0;
+
+    if( ! done )
+    {
+        asm( "movl  $1, %%eax   \n\t"
+             "cpuid             \n\t"
+             : "=c" (c)
+             :
+             : "eax", "ebx", "edx" );
+        done = 1;
+    }
+
+    return( ( c & AESNI_AES ) != 0 );
+}
+
+/*
+ * Binutils needs to be at least 2.19 to support AES-NI instructions.
+ * Unfortunately, a lot of users have a lower version now (2014-04).
+ * Emit bytecode directly in order to support "old" version of gas.
+ *
+ * Opcodes from the Intel architecture reference manual, vol. 3.
+ * We always use registers, so we don't need prefixes for memory operands.
+ * Operand macros are in gas order (src, dst) as opposed to Intel order
+ * (dst, src) in order to blend better into the surrounding assembly code.
+ */
+#define AESDEC      ".byte 0x66,0x0F,0x38,0xDE,"
+#define AESDECLAST  ".byte 0x66,0x0F,0x38,0xDF,"
+#define AESENC      ".byte 0x66,0x0F,0x38,0xDC,"
+#define AESENCLAST  ".byte 0x66,0x0F,0x38,0xDD,"
+#define AESIMC      ".byte 0x66,0x0F,0x38,0xDB,"
+#define AESKEYGENA  ".byte 0x66,0x0F,0x3A,0xDF,"
+#define PCLMULQDQ   ".byte 0x66,0x0F,0x3A,0x44,"
+
+#define xmm0_xmm0   "0xC0"
+#define xmm0_xmm1   "0xC8"
+#define xmm0_xmm2   "0xD0"
+#define xmm0_xmm3   "0xD8"
+#define xmm0_xmm4   "0xE0"
+#define xmm1_xmm0   "0xC1"
+#define xmm1_xmm2   "0xD1"
+
+/*
+ * AES-NI AES-ECB block en(de)cryption
+ */
+void aesni_crypt_ecb( int nr,
+                      unsigned char *rk,
+                      int mode,
+                      const unsigned char input[16],
+                      unsigned char output[16] )
+{
+    asm( "movdqu    (%3), %%xmm0    \n\t" // load input
+         "movdqu    (%1), %%xmm1    \n\t" // load round key 0
+         "pxor      %%xmm1, %%xmm0  \n\t" // round 0
+         "addq      $16, %1         \n\t" // point to next round key
+         "subl      $1, %0          \n\t" // normal rounds = nr - 1
+         "test      %2, %2          \n\t" // mode?
+         "jz        2f              \n\t" // 0 = decrypt
+
+         "1:                        \n\t" // encryption loop
+         "movdqu    (%1), %%xmm1    \n\t" // load round key
+         AESENC     xmm1_xmm0      "\n\t" // do round
+         "addq      $16, %1         \n\t" // point to next round key
+         "subl      $1, %0          \n\t" // loop
+         "jnz       1b              \n\t"
+         "movdqu    (%1), %%xmm1    \n\t" // load round key
+         AESENCLAST xmm1_xmm0      "\n\t" // last round
+         "jmp       3f              \n\t"
+
+         "2:                        \n\t" // decryption loop
+         "movdqu    (%1), %%xmm1    \n\t"
+         AESDEC     xmm1_xmm0      "\n\t" // do round
+         "addq      $16, %1         \n\t"
+         "subl      $1, %0          \n\t"
+         "jnz       2b              \n\t"
+         "movdqu    (%1), %%xmm1    \n\t" // load round key
+         AESDECLAST xmm1_xmm0      "\n\t" // last round
+
+         "3:                        \n\t"
+         "movdqu    %%xmm0, (%4)    \n\t" // export output
+         :
+         : "r" (nr), "r" (rk), "r" (mode), "r" (input), "r" (output)
+         : "memory", "cc", "xmm0", "xmm1" );
+}
+
+/*
+ * Compute decryption round keys from encryption round keys
+ */
+void aesni_inverse_key( unsigned char *invkey,
+                        const unsigned char *fwdkey, int nr )
+{
+    unsigned char *ik = invkey;
+    const unsigned char *fk = fwdkey + 16 * nr;
+
+    memcpy( ik, fk, 16 );
+
+    for( fk -= 16, ik += 16; fk > fwdkey; fk -= 16, ik += 16 )
+        asm( "movdqu (%0), %%xmm0       \n\t"
+             AESIMC  xmm0_xmm0         "\n\t"
+             "movdqu %%xmm0, (%1)       \n\t"
+             :
+             : "r" (fk), "r" (ik)
+             : "memory", "xmm0" );
+
+    memcpy( ik, fk, 16 );
+}
+
+/*
+ * Key expansion, 128-bit case
+ */
+void aesni_setkey_enc_128( unsigned char *rk,
+                           const unsigned char *key )
+{
+    asm( "movdqu (%1), %%xmm0               \n\t" // copy the original key
+         "movdqu %%xmm0, (%0)               \n\t" // as round key 0
+         "jmp 2f                            \n\t" // skip auxiliary routine
+
+         /*
+          * Finish generating the next round key.
+          *
+          * On entry xmm0 is r3:r2:r1:r0 and xmm1 is X:stuff:stuff:stuff
+          * with X = rot( sub( r3 ) ) ^ RCON.
+          *
+          * On exit, xmm0 is r7:r6:r5:r4
+          * with r4 = X + r0, r5 = r4 + r1, r6 = r5 + r2, r7 = r6 + r3
+          * and those are written to the round key buffer.
+          */
+         "1:                                \n\t"
+         "pshufd $0xff, %%xmm1, %%xmm1      \n\t" // X:X:X:X
+         "pxor %%xmm0, %%xmm1               \n\t" // X+r3:X+r2:X+r1:r4
+         "pslldq $4, %%xmm0                 \n\t" // r2:r1:r0:0
+         "pxor %%xmm0, %%xmm1               \n\t" // X+r3+r2:X+r2+r1:r5:r4
+         "pslldq $4, %%xmm0                 \n\t" // etc
+         "pxor %%xmm0, %%xmm1               \n\t"
+         "pslldq $4, %%xmm0                 \n\t"
+         "pxor %%xmm1, %%xmm0               \n\t" // update xmm0 for next time!
+         "add $16, %0                       \n\t" // point to next round key
+         "movdqu %%xmm0, (%0)               \n\t" // write it
+         "ret                               \n\t"
+
+         /* Main "loop" */
+         "2:                                \n\t"
+         AESKEYGENA xmm0_xmm1 ",0x01        \n\tcall 1b \n\t"
+         AESKEYGENA xmm0_xmm1 ",0x02        \n\tcall 1b \n\t"
+         AESKEYGENA xmm0_xmm1 ",0x04        \n\tcall 1b \n\t"
+         AESKEYGENA xmm0_xmm1 ",0x08        \n\tcall 1b \n\t"
+         AESKEYGENA xmm0_xmm1 ",0x10        \n\tcall 1b \n\t"
+         AESKEYGENA xmm0_xmm1 ",0x20        \n\tcall 1b \n\t"
+         AESKEYGENA xmm0_xmm1 ",0x40        \n\tcall 1b \n\t"
+         AESKEYGENA xmm0_xmm1 ",0x80        \n\tcall 1b \n\t"
+         AESKEYGENA xmm0_xmm1 ",0x1B        \n\tcall 1b \n\t"
+         AESKEYGENA xmm0_xmm1 ",0x36        \n\tcall 1b \n\t"
+         :
+         : "r" (rk), "r" (key)
+         : "memory", "cc", "0" );
+}
+
+/*
+ * Key expansion, 192-bit case
+ */
+void aesni_setkey_enc_192( unsigned char *rk,
+                           const unsigned char *key )
+{
+    asm( "movdqu (%1), %%xmm0   \n\t" // copy original round key
+         "movdqu %%xmm0, (%0)   \n\t"
+         "add $16, %0           \n\t"
+         "movq 16(%1), %%xmm1   \n\t"
+         "movq %%xmm1, (%0)     \n\t"
+         "add $8, %0            \n\t"
+         "jmp 2f                \n\t" // skip auxiliary routine
+
+         /*
+          * Finish generating the next 6 quarter-keys.
+          *
+          * On entry xmm0 is r3:r2:r1:r0, xmm1 is stuff:stuff:r5:r4
+          * and xmm2 is stuff:stuff:X:stuff with X = rot( sub( r3 ) ) ^ RCON.
+          *
+          * On exit, xmm0 is r9:r8:r7:r6 and xmm1 is stuff:stuff:r11:r10
+          * and those are written to the round key buffer.
+          */
+         "1:                            \n\t"
+         "pshufd $0x55, %%xmm2, %%xmm2  \n\t" // X:X:X:X
+         "pxor %%xmm0, %%xmm2           \n\t" // X+r3:X+r2:X+r1:r4
+         "pslldq $4, %%xmm0             \n\t" // etc
+         "pxor %%xmm0, %%xmm2           \n\t"
+         "pslldq $4, %%xmm0             \n\t"
+         "pxor %%xmm0, %%xmm2           \n\t"
+         "pslldq $4, %%xmm0             \n\t"
+         "pxor %%xmm2, %%xmm0           \n\t" // update xmm0 = r9:r8:r7:r6
+         "movdqu %%xmm0, (%0)           \n\t"
+         "add $16, %0                   \n\t"
+         "pshufd $0xff, %%xmm0, %%xmm2  \n\t" // r9:r9:r9:r9
+         "pxor %%xmm1, %%xmm2           \n\t" // stuff:stuff:r9+r5:r10
+         "pslldq $4, %%xmm1             \n\t" // r2:r1:r0:0
+         "pxor %%xmm2, %%xmm1           \n\t" // xmm1 = stuff:stuff:r11:r10
+         "movq %%xmm1, (%0)             \n\t"
+         "add $8, %0                    \n\t"
+         "ret                           \n\t"
+
+         "2:                            \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x01    \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x02    \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x04    \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x08    \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x10    \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x20    \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x40    \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x80    \n\tcall 1b \n\t"
+
+         :
+         : "r" (rk), "r" (key)
+         : "memory", "cc", "0" );
+}
+
+/*
+ * Key expansion, 256-bit case
+ */
+void aesni_setkey_enc_256( unsigned char *rk,
+                           const unsigned char *key )
+{
+    asm( "movdqu (%1), %%xmm0           \n\t"
+         "movdqu %%xmm0, (%0)           \n\t"
+         "add $16, %0                   \n\t"
+         "movdqu 16(%1), %%xmm1         \n\t"
+         "movdqu %%xmm1, (%0)           \n\t"
+         "jmp 2f                        \n\t" // skip auxiliary routine
+
+         /*
+          * Finish generating the next two round keys.
+          *
+          * On entry xmm0 is r3:r2:r1:r0, xmm1 is r7:r6:r5:r4 and
+          * xmm2 is X:stuff:stuff:stuff with X = rot( sub( r7 )) ^ RCON
+          *
+          * On exit, xmm0 is r11:r10:r9:r8 and xmm1 is r15:r14:r13:r12
+          * and those have been written to the output buffer.
+          */
+         "1:                                \n\t"
+         "pshufd $0xff, %%xmm2, %%xmm2      \n\t"
+         "pxor %%xmm0, %%xmm2               \n\t"
+         "pslldq $4, %%xmm0                 \n\t"
+         "pxor %%xmm0, %%xmm2               \n\t"
+         "pslldq $4, %%xmm0                 \n\t"
+         "pxor %%xmm0, %%xmm2               \n\t"
+         "pslldq $4, %%xmm0                 \n\t"
+         "pxor %%xmm2, %%xmm0               \n\t"
+         "add $16, %0                       \n\t"
+         "movdqu %%xmm0, (%0)               \n\t"
+
+         /* Set xmm2 to stuff:Y:stuff:stuff with Y = subword( r11 )
+          * and proceed to generate next round key from there */
+         AESKEYGENA xmm0_xmm2 ",0x00        \n\t"
+         "pshufd $0xaa, %%xmm2, %%xmm2      \n\t"
+         "pxor %%xmm1, %%xmm2               \n\t"
+         "pslldq $4, %%xmm1                 \n\t"
+         "pxor %%xmm1, %%xmm2               \n\t"
+         "pslldq $4, %%xmm1                 \n\t"
+         "pxor %%xmm1, %%xmm2               \n\t"
+         "pslldq $4, %%xmm1                 \n\t"
+         "pxor %%xmm2, %%xmm1               \n\t"
+         "add $16, %0                       \n\t"
+         "movdqu %%xmm1, (%0)               \n\t"
+         "ret                               \n\t"
+
+         /*
+          * Main "loop" - Generating one more key than necessary,
+          * see definition of aes_context.buf
+          */
+         "2:                                \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x01        \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x02        \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x04        \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x08        \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x10        \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x20        \n\tcall 1b \n\t"
+         AESKEYGENA xmm1_xmm2 ",0x40        \n\tcall 1b \n\t"
+         :
+         : "r" (rk), "r" (key)
+         : "memory", "cc", "0" );
+}
+
+#endif /* HAVE_AMD64 */
--- a/lib/aes_acc/aesni.h
+++ b/lib/aes_acc/aesni.h
@@ -0,0 +1,117 @@
+/*
+ * This file is adapted from PolarSSL 1.3.19 (GPL)
+ */
+
+/**
+ * \file aesni.h
+ *
+ * \brief AES-NI for hardware AES acceleration on some Intel processors
+ *
+ *  Copyright (C) 2013, ARM Limited, All Rights Reserved
+ *
+ *  This file is part of mbed TLS (https://tls.mbed.org)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef _AESNI_H_
+#define _AESNI_H_
+
+#ifndef AES_ENCRYPT
+#define AES_ENCRYPT     1
+#endif
+
+#ifndef AES_DECRYPT
+#define AES_DECRYPT     0
+#endif
+
+#if defined(__GNUC__) &&  \
+    ( defined(__amd64__) || defined(__x86_64__) ) && \
+    !defined(NO_AESACC)
+#define HAVE_AMD64
+#endif
+
+#if defined(HAVE_AMD64)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief          AES-NI features detection routine
+ *
+ * \return         1 if CPU has support for AES-NI, 0 otherwise
+ */
+int aesni_supported( void );
+
+/**
+ * \brief          AES-NI AES-ECB block en(de)cryption
+ *
+ * \param nr       number of rounds
+ * \param rk       AES round keys
+ * \param mode     AES_ENCRYPT or AES_DECRYPT
+ * \param input    16-byte input block
+ * \param output   16-byte output block
+ */
+void aesni_crypt_ecb( int nr,
+                      unsigned char *rk,
+                      int mode,
+                      const unsigned char input[16],
+                      unsigned char output[16] );
+
+/**
+ * \brief           Compute decryption round keys from encryption round keys
+ *
+ * \param invkey    Round keys for the equivalent inverse cipher
+ * \param fwdkey    Original round keys (for encryption)
+ * \param nr        Number of rounds (that is, number of round keys minus one)
+ */
+void aesni_inverse_key( unsigned char *invkey,
+                        const unsigned char *fwdkey, int nr );
+
+/**
+ * \brief           Perform 128-bit key expansion (for encryption)
+ *
+ * \param rk        Destination buffer where the round keys are written
+ * \param key       Encryption key
+ */
+void aesni_setkey_enc_128( unsigned char *rk,
+                           const unsigned char *key );
+
+/**
+ * \brief           Perform 192-bit key expansion (for encryption)
+ *
+ * \param rk        Destination buffer where the round keys are written
+ * \param key       Encryption key
+ */
+void aesni_setkey_enc_192( unsigned char *rk,
+                           const unsigned char *key );
+
+/**
+ * \brief           Perform 256-bit key expansion (for encryption)
+ *
+ * \param rk        Destination buffer where the round keys are written
+ * \param key       Encryption key
+ */
+void aesni_setkey_enc_256( unsigned char *rk,
+                           const unsigned char *key );
+
+#ifdef __cplusplus
+}
+#endif 
+
+#endif /* HAVE_AMD64 */
+
+#endif /* _AESNI_H_ */
--- a/lib/aes_acc/asm/arm.S
+++ b/lib/aes_acc/asm/arm.S
--- a/lib/aes_acc/asm/arm64.S
+++ b/lib/aes_acc/asm/arm64.S
--- a/lib/aes_acc/asm/arm_arch.h
+++ b/lib/aes_acc/asm/arm_arch.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#ifndef __ARM_ARCH_H__
+# define __ARM_ARCH_H__
+
+# if !defined(__ARM_ARCH__)
+#  if defined(__CC_ARM)
+#   define __ARM_ARCH__ __TARGET_ARCH_ARM
+#   if defined(__BIG_ENDIAN)
+#    define __ARMEB__
+#   else
+#    define __ARMEL__
+#   endif
+#  elif defined(__GNUC__)
+#   if   defined(__aarch64__)
+#    define __ARM_ARCH__ 8
+#    if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
+#     define __ARMEB__
+#    else
+#     define __ARMEL__
+#    endif
+  /*
+   * Why doesn't gcc define __ARM_ARCH__? Instead it defines
+   * bunch of below macros. See all_architectires[] table in
+   * gcc/config/arm/arm.c. On a side note it defines
+   * __ARMEL__/__ARMEB__ for little-/big-endian.
+   */
+#   elif defined(__ARM_ARCH)
+#    define __ARM_ARCH__ __ARM_ARCH
+#   elif defined(__ARM_ARCH_8A__)
+#    define __ARM_ARCH__ 8
+#   elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__)     || \
+        defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__)     || \
+        defined(__ARM_ARCH_7EM__)
+#    define __ARM_ARCH__ 7
+#   elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__)     || \
+        defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__)     || \
+        defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__)    || \
+        defined(__ARM_ARCH_6T2__)
+#    define __ARM_ARCH__ 6
+#   elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__)     || \
+        defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__)    || \
+        defined(__ARM_ARCH_5TEJ__)
+#    define __ARM_ARCH__ 5
+#   elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+#    define __ARM_ARCH__ 4
+#   else
+#    error "unsupported ARM architecture"
+#   endif
+#  endif
+# endif
+
+# if !defined(__ARM_MAX_ARCH__)
+#  define __ARM_MAX_ARCH__ __ARM_ARCH__
+# endif
+
+# if __ARM_MAX_ARCH__<__ARM_ARCH__
+#  error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__"
+# elif __ARM_MAX_ARCH__!=__ARM_ARCH__
+#  if __ARM_ARCH__<7 && __ARM_MAX_ARCH__>=7 && defined(__ARMEB__)
+#   error "can't build universal big-endian binary"
+#  endif
+# endif
+
+# if !__ASSEMBLER__
+extern unsigned int OPENSSL_armcap_P;
+# endif
+
+# define ARMV7_NEON      (1<<0)
+# define ARMV7_TICK      (1<<1)
+# define ARMV8_AES       (1<<2)
+# define ARMV8_SHA1      (1<<3)
+# define ARMV8_SHA256    (1<<4)
+# define ARMV8_PMULL     (1<<5)
+
+#endif
--- a/lib/aes_acc/asm/mips.S
+++ b/lib/aes_acc/asm/mips.S
--- a/lib/aes_acc/asm/mips_be.S
+++ b/lib/aes_acc/asm/mips_be.S
--- a/lib/aes_acc/asm/x64.S
+++ b/lib/aes_acc/asm/x64.S
@@ -0,0 +1,827 @@
+.text	
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_encrypt_core,@function
+.align	16
+_vpaes_encrypt_core:
+	movq	%rdx,%r9
+	movq	$16,%r11
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	.Lk_ipt(%rip),%xmm2
+	pandn	%xmm0,%xmm1
+	movdqu	(%r9),%xmm5
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	.Lk_ipt+16(%rip),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm5,%xmm2
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+	leaq	.Lk_mc_backward(%rip),%r10
+	jmp	.Lenc_entry
+
+.align	16
+.Lenc_loop:
+
+	movdqa	%xmm13,%xmm4
+	movdqa	%xmm12,%xmm0
+.byte	102,15,56,0,226
+.byte	102,15,56,0,195
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm15,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	-64(%r11,%r10,1),%xmm1
+.byte	102,15,56,0,234
+	movdqa	(%r11,%r10,1),%xmm4
+	movdqa	%xmm14,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm5,%xmm2
+.byte	102,15,56,0,193
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+.byte	102,15,56,0,220
+	addq	$16,%r11
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,193
+	andq	$0x30,%r11
+	subq	$1,%rax
+	pxor	%xmm3,%xmm0
+
+.Lenc_entry:
+
+	movdqa	%xmm9,%xmm1
+	movdqa	%xmm11,%xmm5
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,232
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm10,%xmm4
+	pxor	%xmm5,%xmm3
+.byte	102,15,56,0,224
+	movdqa	%xmm10,%xmm2
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%r9),%xmm5
+	pxor	%xmm1,%xmm3
+	jnz	.Lenc_loop
+
+
+	movdqa	-96(%r10),%xmm4
+	movdqa	-80(%r10),%xmm0
+.byte	102,15,56,0,226
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,195
+	movdqa	64(%r11,%r10,1),%xmm1
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,193
+	.byte	0xf3,0xc3
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+
+.type	_vpaes_decrypt_core,@function
+.align	16
+_vpaes_decrypt_core:
+	movq	%rdx,%r9
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	.Lk_dipt(%rip),%xmm2
+	pandn	%xmm0,%xmm1
+	movq	%rax,%r11
+	psrld	$4,%xmm1
+	movdqu	(%r9),%xmm5
+	shlq	$4,%r11
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	.Lk_dipt+16(%rip),%xmm0
+	xorq	$0x30,%r11
+	leaq	.Lk_dsbd(%rip),%r10
+.byte	102,15,56,0,193
+	andq	$0x30,%r11
+	pxor	%xmm5,%xmm2
+	movdqa	.Lk_mc_forward+48(%rip),%xmm5
+	pxor	%xmm2,%xmm0
+	addq	$16,%r9
+	addq	%r10,%r11
+	jmp	.Ldec_entry
+
+.align	16
+.Ldec_loop:
+
+
+
+	movdqa	-32(%r10),%xmm4
+	movdqa	-16(%r10),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	0(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	16(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	32(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	48(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	64(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	80(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	addq	$16,%r9
+.byte	102,15,58,15,237,12
+	pxor	%xmm1,%xmm0
+	subq	$1,%rax
+
+.Ldec_entry:
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	movdqa	%xmm11,%xmm2
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm10,%xmm4
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%r9),%xmm0
+	pxor	%xmm1,%xmm3
+	jnz	.Ldec_loop
+
+
+	movdqa	96(%r10),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	112(%r10),%xmm0
+	movdqa	-352(%r11),%xmm2
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,194
+	.byte	0xf3,0xc3
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+
+
+
+
+
+.type	_vpaes_schedule_core,@function
+.align	16
+_vpaes_schedule_core:
+
+
+
+
+
+	call	_vpaes_preheat
+	movdqa	.Lk_rcon(%rip),%xmm8
+	movdqu	(%rdi),%xmm0
+
+
+	movdqa	%xmm0,%xmm3
+	leaq	.Lk_ipt(%rip),%r11
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm7
+
+	leaq	.Lk_sr(%rip),%r10
+	testq	%rcx,%rcx
+	jnz	.Lschedule_am_decrypting
+
+
+	movdqu	%xmm0,(%rdx)
+	jmp	.Lschedule_go
+
+.Lschedule_am_decrypting:
+
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,217
+	movdqu	%xmm3,(%rdx)
+	xorq	$0x30,%r8
+
+.Lschedule_go:
+	cmpl	$192,%esi
+	ja	.Lschedule_256
+	je	.Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+	movl	$10,%esi
+
+.Loop_schedule_128:
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle
+	jmp	.Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_192:
+	movdqu	8(%rdi),%xmm0
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm6
+	pxor	%xmm4,%xmm4
+	movhlps	%xmm4,%xmm6
+	movl	$4,%esi
+
+.Loop_schedule_192:
+	call	_vpaes_schedule_round
+.byte	102,15,58,15,198,8
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	jmp	.Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_256:
+	movdqu	16(%rdi),%xmm0
+	call	_vpaes_schedule_transform
+	movl	$7,%esi
+
+.Loop_schedule_256:
+	call	_vpaes_schedule_mangle
+	movdqa	%xmm0,%xmm6
+
+
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle
+
+
+	pshufd	$0xFF,%xmm0,%xmm0
+	movdqa	%xmm7,%xmm5
+	movdqa	%xmm6,%xmm7
+	call	_vpaes_schedule_low_round
+	movdqa	%xmm5,%xmm7
+
+	jmp	.Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_mangle_last:
+
+	leaq	.Lk_deskew(%rip),%r11
+	testq	%rcx,%rcx
+	jnz	.Lschedule_mangle_last_dec
+
+
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,193
+	leaq	.Lk_opt(%rip),%r11
+	addq	$32,%rdx
+
+.Lschedule_mangle_last_dec:
+	addq	$-16,%rdx
+	pxor	.Lk_s63(%rip),%xmm0
+	call	_vpaes_schedule_transform
+	movdqu	%xmm0,(%rdx)
+
+
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_192_smear,@function
+.align	16
+_vpaes_schedule_192_smear:
+	pshufd	$0x80,%xmm6,%xmm1
+	pshufd	$0xFE,%xmm7,%xmm0
+	pxor	%xmm1,%xmm6
+	pxor	%xmm1,%xmm1
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm6,%xmm0
+	movhlps	%xmm1,%xmm6
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_round,@function
+.align	16
+_vpaes_schedule_round:
+
+	pxor	%xmm1,%xmm1
+.byte	102,65,15,58,15,200,15
+.byte	102,69,15,58,15,192,15
+	pxor	%xmm1,%xmm7
+
+
+	pshufd	$0xFF,%xmm0,%xmm0
+.byte	102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+	movdqa	%xmm7,%xmm1
+	pslldq	$4,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm7,%xmm1
+	pslldq	$8,%xmm7
+	pxor	%xmm1,%xmm7
+	pxor	.Lk_s63(%rip),%xmm7
+
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	%xmm11,%xmm2
+.byte	102,15,56,0,208
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	movdqa	%xmm10,%xmm4
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	pxor	%xmm0,%xmm2
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,220
+	pxor	%xmm1,%xmm3
+	movdqa	%xmm13,%xmm4
+.byte	102,15,56,0,226
+	movdqa	%xmm12,%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+
+
+	pxor	%xmm7,%xmm0
+	movdqa	%xmm0,%xmm7
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_transform,@function
+.align	16
+_vpaes_schedule_transform:
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	(%r11),%xmm2
+.byte	102,15,56,0,208
+	movdqa	16(%r11),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm2,%xmm0
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_mangle,@function
+.align	16
+_vpaes_schedule_mangle:
+	movdqa	%xmm0,%xmm4
+	movdqa	.Lk_mc_forward(%rip),%xmm5
+	testq	%rcx,%rcx
+	jnz	.Lschedule_mangle_dec
+
+
+	addq	$16,%rdx
+	pxor	.Lk_s63(%rip),%xmm4
+.byte	102,15,56,0,229
+	movdqa	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+
+	jmp	.Lschedule_mangle_both
+.align	16
+.Lschedule_mangle_dec:
+
+	leaq	.Lk_dksd(%rip),%r11
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm4,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm4
+
+	movdqa	0(%r11),%xmm2
+.byte	102,15,56,0,212
+	movdqa	16(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	32(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	48(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	64(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	80(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	96(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	112(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+
+	addq	$-16,%rdx
+
+.Lschedule_mangle_both:
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,217
+	addq	$-16,%r8
+	andq	$0x30,%r8
+	movdqu	%xmm3,(%rdx)
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+
+
+
+.globl	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,@function
+.align	16
+vpaes_set_encrypt_key:
+	movl	%esi,%eax
+	shrl	$5,%eax
+	addl	$5,%eax
+	movl	%eax,240(%rdx)
+
+	movl	$0,%ecx
+	movl	$0x30,%r8d
+	call	_vpaes_schedule_core
+	xorl	%eax,%eax
+	.byte	0xf3,0xc3
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,@function
+.align	16
+vpaes_set_decrypt_key:
+	movl	%esi,%eax
+	shrl	$5,%eax
+	addl	$5,%eax
+	movl	%eax,240(%rdx)
+	shll	$4,%eax
+	leaq	16(%rdx,%rax,1),%rdx
+
+	movl	$1,%ecx
+	movl	%esi,%r8d
+	shrl	$1,%r8d
+	andl	$32,%r8d
+	xorl	$32,%r8d
+	call	_vpaes_schedule_core
+	xorl	%eax,%eax
+	.byte	0xf3,0xc3
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+.globl	vpaes_encrypt
+.type	vpaes_encrypt,@function
+.align	16
+vpaes_encrypt:
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_encrypt_core
+	movdqu	%xmm0,(%rsi)
+	.byte	0xf3,0xc3
+.size	vpaes_encrypt,.-vpaes_encrypt
+
+.globl	vpaes_decrypt
+.type	vpaes_decrypt,@function
+.align	16
+vpaes_decrypt:
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_decrypt_core
+	movdqu	%xmm0,(%rsi)
+	.byte	0xf3,0xc3
+.size	vpaes_decrypt,.-vpaes_decrypt
+.globl	vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,@function
+.align	16
+vpaes_cbc_encrypt:
+	xchgq	%rcx,%rdx
+	subq	$16,%rcx
+	jc	.Lcbc_abort
+	movdqu	(%r8),%xmm6
+	subq	%rdi,%rsi
+	call	_vpaes_preheat
+	cmpl	$0,%r9d
+	je	.Lcbc_dec_loop
+	jmp	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_loop:
+	movdqu	(%rdi),%xmm0
+	pxor	%xmm6,%xmm0
+	call	_vpaes_encrypt_core
+	movdqa	%xmm0,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	leaq	16(%rdi),%rdi
+	subq	$16,%rcx
+	jnc	.Lcbc_enc_loop
+	jmp	.Lcbc_done
+.align	16
+.Lcbc_dec_loop:
+	movdqu	(%rdi),%xmm0
+	movdqa	%xmm0,%xmm7
+	call	_vpaes_decrypt_core
+	pxor	%xmm6,%xmm0
+	movdqa	%xmm7,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	leaq	16(%rdi),%rdi
+	subq	$16,%rcx
+	jnc	.Lcbc_dec_loop
+.Lcbc_done:
+	movdqu	%xmm6,(%r8)
+.Lcbc_abort:
+	.byte	0xf3,0xc3
+.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+
+
+
+
+
+.type	_vpaes_preheat,@function
+.align	16
+_vpaes_preheat:
+	leaq	.Lk_s0F(%rip),%r10
+	movdqa	-32(%r10),%xmm10
+	movdqa	-16(%r10),%xmm11
+	movdqa	0(%r10),%xmm9
+	movdqa	48(%r10),%xmm13
+	movdqa	64(%r10),%xmm12
+	movdqa	80(%r10),%xmm15
+	movdqa	96(%r10),%xmm14
+	.byte	0xf3,0xc3
+.size	_vpaes_preheat,.-_vpaes_preheat
+
+
+
+
+
+.type	_vpaes_consts,@object
+.align	64
+_vpaes_consts:
+.Lk_inv:
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:
+.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:
+.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+.Lk_dksd:
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+.Lk_dipt:
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	64
+.size	_vpaes_consts,.-_vpaes_consts
--- a/lib/aes_acc/asm/x86.S
+++ b/lib/aes_acc/asm/x86.S
--- a/lib/md5.c
+++ b/lib/md5.c
@@ -1,176 +1,312 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-
-
 /*
- *  this file comes from  https://github.com/pod32g/MD5/blob/master/md5.c
+ * This file is adapted from PolarSSL 1.3.19 (GPL)
 */

-// Constants are the integer part of the sines of integers (in radians) * 2^32.
-const uint32_t k[64] = {
-0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee ,
-0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 ,
-0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be ,
-0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 ,
-0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa ,
-0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 ,
-0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed ,
-0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a ,
-0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c ,
-0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 ,
-0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 ,
-0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 ,
-0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 ,
-0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 ,
-0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 ,
-0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 };
-
-// r specifies the per-round shift amounts
-const uint32_t r[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
-                      5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20,
-                      4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
-                      6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};
-
-// leftrotate function definition
-#define LEFTROTATE(x, c) (((x) << (c)) | ((x) >> (32 - (c))))
-
-void to_bytes(uint32_t val, uint8_t *bytes)
-{
-    bytes[0] = (uint8_t) val;
-    bytes[1] = (uint8_t) (val >> 8);
-    bytes[2] = (uint8_t) (val >> 16);
-    bytes[3] = (uint8_t) (val >> 24);
-}
-
-uint32_t to_int32(const uint8_t *bytes)
-{
-    return (uint32_t) bytes[0]
-        | ((uint32_t) bytes[1] << 8)
-        | ((uint32_t) bytes[2] << 16)
-        | ((uint32_t) bytes[3] << 24);
-}
-
-void md5(const uint8_t *initial_msg, size_t initial_len, uint8_t *digest) {
-
-    // These vars will contain the hash
-    uint32_t h0, h1, h2, h3;
-
-    // Message (to prepare)
-    uint8_t *msg = NULL;
-
-    size_t new_len, offset;
-    uint32_t w[16];
-    uint32_t a, b, c, d, i, f, g, temp;
-
-    // Initialize variables - simple count in nibbles:
-    h0 = 0x67452301;
-    h1 = 0xefcdab89;
-    h2 = 0x98badcfe;
-    h3 = 0x10325476;
-
-    //Pre-processing:
-    //append "1" bit to message
-    //append "0" bits until message length in bits ≡ 448 (mod 512)
-    //append length mod (2^64) to message
-
-    for (new_len = initial_len + 1; new_len % (512/8) != 448/8; new_len++)
-        ;
-
-	uint8_t buf[new_len + 8];
-    msg = buf;//(uint8_t*)malloc(new_len + 8);
-    memcpy(msg, initial_msg, initial_len);
-    msg[initial_len] = 0x80; // append the "1" bit; most significant bit is "first"
-    for (offset = initial_len + 1; offset < new_len; offset++)
-        msg[offset] = 0; // append "0" bits
-
-    // append the len in bits at the end of the buffer.
-    to_bytes(initial_len*8, msg + new_len);
-    // initial_len>>29 == initial_len*8>>32, but avoids overflow.
-    to_bytes(initial_len>>29, msg + new_len + 4);
-
-    // Process the message in successive 512-bit chunks:
-    //for each 512-bit chunk of message:
-    for(offset=0; offset<new_len; offset += (512/8)) {
-
-        // break chunk into sixteen 32-bit words w[j], 0 ≤ j ≤ 15
-        for (i = 0; i < 16; i++)
-            w[i] = to_int32(msg + offset + i*4);
-
-        // Initialize hash value for this chunk:
-        a = h0;
-        b = h1;
-        c = h2;
-        d = h3;
-
-        // Main loop:
-        for(i = 0; i<64; i++) {
-
-            if (i < 16) {
-                f = (b & c) | ((~b) & d);
-                g = i;
-            } else if (i < 32) {
-                f = (d & b) | ((~d) & c);
-                g = (5*i + 1) % 16;
-            } else if (i < 48) {
-                f = b ^ c ^ d;
-                g = (3*i + 5) % 16;
-            } else {
-                f = c ^ (b | (~d));
-                g = (7*i) % 16;
-            }
-
-            temp = d;
-            d = c;
-            c = b;
-            b = b + LEFTROTATE((a + f + k[i] + w[g]), r[i]);
-            a = temp;
-
-        }
-
-        // Add this chunk's hash to result so far:
-        h0 += a;
-        h1 += b;
-        h2 += c;
-        h3 += d;
-
-    }
-
-    // cleanup
-    //free(msg);
-
-    //var char digest[16] := h0 append h1 append h2 append h3 //(Output is in little-endian)
-    to_bytes(h0, digest);
-    to_bytes(h1, digest + 4);
-    to_bytes(h2, digest + 8);
-    to_bytes(h3, digest + 12);
-}
 /*
-int main(int argc, char **argv) {
-    char *msg;
-    size_t len;
-    int i;
-    uint8_t result[16];
+ *  RFC 1321 compliant MD5 implementation
+ *
+ *  Copyright (C) 2006-2014, ARM Limited, All Rights Reserved
+ *
+ *  This file is part of mbed TLS (https://tls.mbed.org)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+/*
+ *  The MD5 algorithm was designed by Ron Rivest in 1991.
+ *
+ *  http://www.ietf.org/rfc/rfc1321.txt
+ */

-    if (argc < 2) {
-        printf("usage: %s 'string'\n", argv[0]);
-        return 1;
-    }
-    msg = argv[1];
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>

-    len = strlen(msg);
-
-    // benchmark
-    for (i = 0; i < 1000000; i++) {
-        md5((uint8_t*)msg, len, result);
-    }
-
-    // display result
-    for (i = 0; i < 16; i++)
-        printf("%2.2x", result[i]);
-    puts("");
-
-    return 0;
+typedef struct
+{
+    uint32_t total[2];          /*!< number of bytes processed  */
+    uint32_t state[4];          /*!< intermediate digest state  */
+    unsigned char buffer[64];   /*!< data block being processed */
+}
+md5_context;
+
+/* Implementation that should never be optimized out by the compiler */
+static void polarssl_zeroize( void *v, size_t n ) {
+    volatile unsigned char *p = (unsigned char *) v; while( n-- ) *p++ = 0;
+}
+
+/*
+ * 32-bit integer manipulation macros (little endian)
+ */
+#ifndef GET_UINT32_LE
+#define GET_UINT32_LE(n,b,i)                            \
+{                                                       \
+    (n) = ( (uint32_t) (b)[(i)    ]       )             \
+        | ( (uint32_t) (b)[(i) + 1] <<  8 )             \
+        | ( (uint32_t) (b)[(i) + 2] << 16 )             \
+        | ( (uint32_t) (b)[(i) + 3] << 24 );            \
+}
+#endif
+
+#ifndef PUT_UINT32_LE
+#define PUT_UINT32_LE(n,b,i)                                    \
+{                                                               \
+    (b)[(i)    ] = (unsigned char) ( ( (n)       ) & 0xFF );    \
+    (b)[(i) + 1] = (unsigned char) ( ( (n) >>  8 ) & 0xFF );    \
+    (b)[(i) + 2] = (unsigned char) ( ( (n) >> 16 ) & 0xFF );    \
+    (b)[(i) + 3] = (unsigned char) ( ( (n) >> 24 ) & 0xFF );    \
+}
+#endif
+
+void md5_init( md5_context *ctx )
+{
+    memset( ctx, 0, sizeof( md5_context ) );
+}
+
+void md5_free( md5_context *ctx )
+{
+    if( ctx == NULL )
+        return;
+
+    polarssl_zeroize( ctx, sizeof( md5_context ) );
+}
+
+/*
+ * MD5 context setup
+ */
+void md5_starts( md5_context *ctx )
+{
+    ctx->total[0] = 0;
+    ctx->total[1] = 0;
+
+    ctx->state[0] = 0x67452301;
+    ctx->state[1] = 0xEFCDAB89;
+    ctx->state[2] = 0x98BADCFE;
+    ctx->state[3] = 0x10325476;
+}
+
+void md5_process( md5_context *ctx, const unsigned char data[64] )
+{
+    uint32_t X[16], A, B, C, D;
+
+    GET_UINT32_LE( X[ 0], data,  0 );
+    GET_UINT32_LE( X[ 1], data,  4 );
+    GET_UINT32_LE( X[ 2], data,  8 );
+    GET_UINT32_LE( X[ 3], data, 12 );
+    GET_UINT32_LE( X[ 4], data, 16 );
+    GET_UINT32_LE( X[ 5], data, 20 );
+    GET_UINT32_LE( X[ 6], data, 24 );
+    GET_UINT32_LE( X[ 7], data, 28 );
+    GET_UINT32_LE( X[ 8], data, 32 );
+    GET_UINT32_LE( X[ 9], data, 36 );
+    GET_UINT32_LE( X[10], data, 40 );
+    GET_UINT32_LE( X[11], data, 44 );
+    GET_UINT32_LE( X[12], data, 48 );
+    GET_UINT32_LE( X[13], data, 52 );
+    GET_UINT32_LE( X[14], data, 56 );
+    GET_UINT32_LE( X[15], data, 60 );
+
+#define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n)))
+
+#define P(a,b,c,d,k,s,t)                                \
+{                                                       \
+    a += F(b,c,d) + X[k] + t; a = S(a,s) + b;           \
+}
+
+    A = ctx->state[0];
+    B = ctx->state[1];
+    C = ctx->state[2];
+    D = ctx->state[3];
+
+#define F(x,y,z) (z ^ (x & (y ^ z)))
+
+    P( A, B, C, D,  0,  7, 0xD76AA478 );
+    P( D, A, B, C,  1, 12, 0xE8C7B756 );
+    P( C, D, A, B,  2, 17, 0x242070DB );
+    P( B, C, D, A,  3, 22, 0xC1BDCEEE );
+    P( A, B, C, D,  4,  7, 0xF57C0FAF );
+    P( D, A, B, C,  5, 12, 0x4787C62A );
+    P( C, D, A, B,  6, 17, 0xA8304613 );
+    P( B, C, D, A,  7, 22, 0xFD469501 );
+    P( A, B, C, D,  8,  7, 0x698098D8 );
+    P( D, A, B, C,  9, 12, 0x8B44F7AF );
+    P( C, D, A, B, 10, 17, 0xFFFF5BB1 );
+    P( B, C, D, A, 11, 22, 0x895CD7BE );
+    P( A, B, C, D, 12,  7, 0x6B901122 );
+    P( D, A, B, C, 13, 12, 0xFD987193 );
+    P( C, D, A, B, 14, 17, 0xA679438E );
+    P( B, C, D, A, 15, 22, 0x49B40821 );
+
+#undef F
+
+#define F(x,y,z) (y ^ (z & (x ^ y)))
+
+    P( A, B, C, D,  1,  5, 0xF61E2562 );
+    P( D, A, B, C,  6,  9, 0xC040B340 );
+    P( C, D, A, B, 11, 14, 0x265E5A51 );
+    P( B, C, D, A,  0, 20, 0xE9B6C7AA );
+    P( A, B, C, D,  5,  5, 0xD62F105D );
+    P( D, A, B, C, 10,  9, 0x02441453 );
+    P( C, D, A, B, 15, 14, 0xD8A1E681 );
+    P( B, C, D, A,  4, 20, 0xE7D3FBC8 );
+    P( A, B, C, D,  9,  5, 0x21E1CDE6 );
+    P( D, A, B, C, 14,  9, 0xC33707D6 );
+    P( C, D, A, B,  3, 14, 0xF4D50D87 );
+    P( B, C, D, A,  8, 20, 0x455A14ED );
+    P( A, B, C, D, 13,  5, 0xA9E3E905 );
+    P( D, A, B, C,  2,  9, 0xFCEFA3F8 );
+    P( C, D, A, B,  7, 14, 0x676F02D9 );
+    P( B, C, D, A, 12, 20, 0x8D2A4C8A );
+
+#undef F
+
+#define F(x,y,z) (x ^ y ^ z)
+
+    P( A, B, C, D,  5,  4, 0xFFFA3942 );
+    P( D, A, B, C,  8, 11, 0x8771F681 );
+    P( C, D, A, B, 11, 16, 0x6D9D6122 );
+    P( B, C, D, A, 14, 23, 0xFDE5380C );
+    P( A, B, C, D,  1,  4, 0xA4BEEA44 );
+    P( D, A, B, C,  4, 11, 0x4BDECFA9 );
+    P( C, D, A, B,  7, 16, 0xF6BB4B60 );
+    P( B, C, D, A, 10, 23, 0xBEBFBC70 );
+    P( A, B, C, D, 13,  4, 0x289B7EC6 );
+    P( D, A, B, C,  0, 11, 0xEAA127FA );
+    P( C, D, A, B,  3, 16, 0xD4EF3085 );
+    P( B, C, D, A,  6, 23, 0x04881D05 );
+    P( A, B, C, D,  9,  4, 0xD9D4D039 );
+    P( D, A, B, C, 12, 11, 0xE6DB99E5 );
+    P( C, D, A, B, 15, 16, 0x1FA27CF8 );
+    P( B, C, D, A,  2, 23, 0xC4AC5665 );
+
+#undef F
+
+#define F(x,y,z) (y ^ (x | ~z))
+
+    P( A, B, C, D,  0,  6, 0xF4292244 );
+    P( D, A, B, C,  7, 10, 0x432AFF97 );
+    P( C, D, A, B, 14, 15, 0xAB9423A7 );
+    P( B, C, D, A,  5, 21, 0xFC93A039 );
+    P( A, B, C, D, 12,  6, 0x655B59C3 );
+    P( D, A, B, C,  3, 10, 0x8F0CCC92 );
+    P( C, D, A, B, 10, 15, 0xFFEFF47D );
+    P( B, C, D, A,  1, 21, 0x85845DD1 );
+    P( A, B, C, D,  8,  6, 0x6FA87E4F );
+    P( D, A, B, C, 15, 10, 0xFE2CE6E0 );
+    P( C, D, A, B,  6, 15, 0xA3014314 );
+    P( B, C, D, A, 13, 21, 0x4E0811A1 );
+    P( A, B, C, D,  4,  6, 0xF7537E82 );
+    P( D, A, B, C, 11, 10, 0xBD3AF235 );
+    P( C, D, A, B,  2, 15, 0x2AD7D2BB );
+    P( B, C, D, A,  9, 21, 0xEB86D391 );
+
+#undef F
+
+    ctx->state[0] += A;
+    ctx->state[1] += B;
+    ctx->state[2] += C;
+    ctx->state[3] += D;
+}
+
+/*
+ * MD5 process buffer
+ */
+void md5_update( md5_context *ctx, const unsigned char *input, size_t ilen )
+{
+    size_t fill;
+    uint32_t left;
+
+    if( ilen == 0 )
+        return;
+
+    left = ctx->total[0] & 0x3F;
+    fill = 64 - left;
+
+    ctx->total[0] += (uint32_t) ilen;
+    ctx->total[0] &= 0xFFFFFFFF;
+
+    if( ctx->total[0] < (uint32_t) ilen )
+        ctx->total[1]++;
+
+    if( left && ilen >= fill )
+    {
+        memcpy( (void *) (ctx->buffer + left), input, fill );
+        md5_process( ctx, ctx->buffer );
+        input += fill;
+        ilen  -= fill;
+        left = 0;
+    }
+
+    while( ilen >= 64 )
+    {
+        md5_process( ctx, input );
+        input += 64;
+        ilen  -= 64;
+    }
+
+    if( ilen > 0 )
+    {
+        memcpy( (void *) (ctx->buffer + left), input, ilen );
+    }
+}
+
+static const unsigned char md5_padding[64] =
+{
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+ * MD5 final digest
+ */
+void md5_finish( md5_context *ctx, unsigned char output[16] )
+{
+    uint32_t last, padn;
+    uint32_t high, low;
+    unsigned char msglen[8];
+
+    high = ( ctx->total[0] >> 29 )
+         | ( ctx->total[1] <<  3 );
+    low  = ( ctx->total[0] <<  3 );
+
+    PUT_UINT32_LE( low,  msglen, 0 );
+    PUT_UINT32_LE( high, msglen, 4 );
+
+    last = ctx->total[0] & 0x3F;
+    padn = ( last < 56 ) ? ( 56 - last ) : ( 120 - last );
+
+    md5_update( ctx, md5_padding, padn );
+    md5_update( ctx, msglen, 8 );
+
+    PUT_UINT32_LE( ctx->state[0], output,  0 );
+    PUT_UINT32_LE( ctx->state[1], output,  4 );
+    PUT_UINT32_LE( ctx->state[2], output,  8 );
+    PUT_UINT32_LE( ctx->state[3], output, 12 );
+}
+
+/*
+ * output = MD5( input buffer )
+ */
+void md5( const unsigned char *input, size_t ilen, unsigned char output[16] )
+{
+    md5_context ctx;
+
+    md5_init( &ctx );
+    md5_starts( &ctx );
+    md5_update( &ctx, input, ilen );
+    md5_finish( &ctx, output );
+    md5_free( &ctx );
 }
-*/
--- a/lib/md5.h
+++ b/lib/md5.h
@@ -1,9 +1,7 @@
 #ifndef _MD5_H_
 #define _MD5_H_
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
 #include <stdint.h>
+#include <stddef.h>

 void md5(const uint8_t *initial_msg, size_t initial_len, uint8_t *digest);

--- a/lib/sha1.c
+++ b/lib/sha1.c
@@ -0,0 +1,345 @@
+/*
+ * This file is adapted from PolarSSL 1.3.19 (GPL)
+ */
+
+/*
+ *  FIPS-180-1 compliant SHA-1 implementation
+ *
+ *  Copyright (C) 2006-2014, ARM Limited, All Rights Reserved
+ *
+ *  This file is part of mbed TLS (https://tls.mbed.org)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+/*
+ *  The SHA-1 standard was published by NIST in 1993.
+ *
+ *  http://www.itl.nist.gov/fipspubs/fip180-1.htm
+ */
+
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct
+{
+    uint32_t total[2];          /*!< number of bytes processed  */
+    uint32_t state[5];          /*!< intermediate digest state  */
+    unsigned char buffer[64];   /*!< data block being processed */
+}
+sha1_context;
+
+/* Implementation that should never be optimized out by the compiler */
+static void polarssl_zeroize( void *v, size_t n ) {
+    volatile unsigned char *p = (unsigned char *) v; while( n-- ) *p++ = 0;
+}
+
+/*
+ * 32-bit integer manipulation macros (big endian)
+ */
+#ifndef GET_UINT32_BE
+#define GET_UINT32_BE(n,b,i)                            \
+{                                                       \
+    (n) = ( (uint32_t) (b)[(i)    ] << 24 )             \
+        | ( (uint32_t) (b)[(i) + 1] << 16 )             \
+        | ( (uint32_t) (b)[(i) + 2] <<  8 )             \
+        | ( (uint32_t) (b)[(i) + 3]       );            \
+}
+#endif
+
+#ifndef PUT_UINT32_BE
+#define PUT_UINT32_BE(n,b,i)                            \
+{                                                       \
+    (b)[(i)    ] = (unsigned char) ( (n) >> 24 );       \
+    (b)[(i) + 1] = (unsigned char) ( (n) >> 16 );       \
+    (b)[(i) + 2] = (unsigned char) ( (n) >>  8 );       \
+    (b)[(i) + 3] = (unsigned char) ( (n)       );       \
+}
+#endif
+
+void sha1_init( sha1_context *ctx )
+{
+    memset( ctx, 0, sizeof( sha1_context ) );
+}
+
+void sha1_free( sha1_context *ctx )
+{
+    if( ctx == NULL )
+        return;
+
+    polarssl_zeroize( ctx, sizeof( sha1_context ) );
+}
+
+/*
+ * SHA-1 context setup
+ */
+void sha1_starts( sha1_context *ctx )
+{
+    ctx->total[0] = 0;
+    ctx->total[1] = 0;
+
+    ctx->state[0] = 0x67452301;
+    ctx->state[1] = 0xEFCDAB89;
+    ctx->state[2] = 0x98BADCFE;
+    ctx->state[3] = 0x10325476;
+    ctx->state[4] = 0xC3D2E1F0;
+}
+
+void sha1_process( sha1_context *ctx, const unsigned char data[64] )
+{
+    uint32_t temp, W[16], A, B, C, D, E;
+
+    GET_UINT32_BE( W[ 0], data,  0 );
+    GET_UINT32_BE( W[ 1], data,  4 );
+    GET_UINT32_BE( W[ 2], data,  8 );
+    GET_UINT32_BE( W[ 3], data, 12 );
+    GET_UINT32_BE( W[ 4], data, 16 );
+    GET_UINT32_BE( W[ 5], data, 20 );
+    GET_UINT32_BE( W[ 6], data, 24 );
+    GET_UINT32_BE( W[ 7], data, 28 );
+    GET_UINT32_BE( W[ 8], data, 32 );
+    GET_UINT32_BE( W[ 9], data, 36 );
+    GET_UINT32_BE( W[10], data, 40 );
+    GET_UINT32_BE( W[11], data, 44 );
+    GET_UINT32_BE( W[12], data, 48 );
+    GET_UINT32_BE( W[13], data, 52 );
+    GET_UINT32_BE( W[14], data, 56 );
+    GET_UINT32_BE( W[15], data, 60 );
+
+#define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n)))
+
+#define R(t)                                            \
+(                                                       \
+    temp = W[( t -  3 ) & 0x0F] ^ W[( t - 8 ) & 0x0F] ^ \
+           W[( t - 14 ) & 0x0F] ^ W[  t       & 0x0F],  \
+    ( W[t & 0x0F] = S(temp,1) )                         \
+)
+
+#define P(a,b,c,d,e,x)                                  \
+{                                                       \
+    e += S(a,5) + F(b,c,d) + K + x; b = S(b,30);        \
+}
+
+    A = ctx->state[0];
+    B = ctx->state[1];
+    C = ctx->state[2];
+    D = ctx->state[3];
+    E = ctx->state[4];
+
+#define F(x,y,z) (z ^ (x & (y ^ z)))
+#define K 0x5A827999
+
+    P( A, B, C, D, E, W[0]  );
+    P( E, A, B, C, D, W[1]  );
+    P( D, E, A, B, C, W[2]  );
+    P( C, D, E, A, B, W[3]  );
+    P( B, C, D, E, A, W[4]  );
+    P( A, B, C, D, E, W[5]  );
+    P( E, A, B, C, D, W[6]  );
+    P( D, E, A, B, C, W[7]  );
+    P( C, D, E, A, B, W[8]  );
+    P( B, C, D, E, A, W[9]  );
+    P( A, B, C, D, E, W[10] );
+    P( E, A, B, C, D, W[11] );
+    P( D, E, A, B, C, W[12] );
+    P( C, D, E, A, B, W[13] );
+    P( B, C, D, E, A, W[14] );
+    P( A, B, C, D, E, W[15] );
+    P( E, A, B, C, D, R(16) );
+    P( D, E, A, B, C, R(17) );
+    P( C, D, E, A, B, R(18) );
+    P( B, C, D, E, A, R(19) );
+
+#undef K
+#undef F
+
+#define F(x,y,z) (x ^ y ^ z)
+#define K 0x6ED9EBA1
+
+    P( A, B, C, D, E, R(20) );
+    P( E, A, B, C, D, R(21) );
+    P( D, E, A, B, C, R(22) );
+    P( C, D, E, A, B, R(23) );
+    P( B, C, D, E, A, R(24) );
+    P( A, B, C, D, E, R(25) );
+    P( E, A, B, C, D, R(26) );
+    P( D, E, A, B, C, R(27) );
+    P( C, D, E, A, B, R(28) );
+    P( B, C, D, E, A, R(29) );
+    P( A, B, C, D, E, R(30) );
+    P( E, A, B, C, D, R(31) );
+    P( D, E, A, B, C, R(32) );
+    P( C, D, E, A, B, R(33) );
+    P( B, C, D, E, A, R(34) );
+    P( A, B, C, D, E, R(35) );
+    P( E, A, B, C, D, R(36) );
+    P( D, E, A, B, C, R(37) );
+    P( C, D, E, A, B, R(38) );
+    P( B, C, D, E, A, R(39) );
+
+#undef K
+#undef F
+
+#define F(x,y,z) ((x & y) | (z & (x | y)))
+#define K 0x8F1BBCDC
+
+    P( A, B, C, D, E, R(40) );
+    P( E, A, B, C, D, R(41) );
+    P( D, E, A, B, C, R(42) );
+    P( C, D, E, A, B, R(43) );
+    P( B, C, D, E, A, R(44) );
+    P( A, B, C, D, E, R(45) );
+    P( E, A, B, C, D, R(46) );
+    P( D, E, A, B, C, R(47) );
+    P( C, D, E, A, B, R(48) );
+    P( B, C, D, E, A, R(49) );
+    P( A, B, C, D, E, R(50) );
+    P( E, A, B, C, D, R(51) );
+    P( D, E, A, B, C, R(52) );
+    P( C, D, E, A, B, R(53) );
+    P( B, C, D, E, A, R(54) );
+    P( A, B, C, D, E, R(55) );
+    P( E, A, B, C, D, R(56) );
+    P( D, E, A, B, C, R(57) );
+    P( C, D, E, A, B, R(58) );
+    P( B, C, D, E, A, R(59) );
+
+#undef K
+#undef F
+
+#define F(x,y,z) (x ^ y ^ z)
+#define K 0xCA62C1D6
+
+    P( A, B, C, D, E, R(60) );
+    P( E, A, B, C, D, R(61) );
+    P( D, E, A, B, C, R(62) );
+    P( C, D, E, A, B, R(63) );
+    P( B, C, D, E, A, R(64) );
+    P( A, B, C, D, E, R(65) );
+    P( E, A, B, C, D, R(66) );
+    P( D, E, A, B, C, R(67) );
+    P( C, D, E, A, B, R(68) );
+    P( B, C, D, E, A, R(69) );
+    P( A, B, C, D, E, R(70) );
+    P( E, A, B, C, D, R(71) );
+    P( D, E, A, B, C, R(72) );
+    P( C, D, E, A, B, R(73) );
+    P( B, C, D, E, A, R(74) );
+    P( A, B, C, D, E, R(75) );
+    P( E, A, B, C, D, R(76) );
+    P( D, E, A, B, C, R(77) );
+    P( C, D, E, A, B, R(78) );
+    P( B, C, D, E, A, R(79) );
+
+#undef K
+#undef F
+
+    ctx->state[0] += A;
+    ctx->state[1] += B;
+    ctx->state[2] += C;
+    ctx->state[3] += D;
+    ctx->state[4] += E;
+}
+
+/*
+ * SHA-1 process buffer
+ */
+void sha1_update( sha1_context *ctx, const unsigned char *input, size_t ilen )
+{
+    size_t fill;
+    uint32_t left;
+
+    if( ilen == 0 )
+        return;
+
+    left = ctx->total[0] & 0x3F;
+    fill = 64 - left;
+
+    ctx->total[0] += (uint32_t) ilen;
+    ctx->total[0] &= 0xFFFFFFFF;
+
+    if( ctx->total[0] < (uint32_t) ilen )
+        ctx->total[1]++;
+
+    if( left && ilen >= fill )
+    {
+        memcpy( (void *) (ctx->buffer + left), input, fill );
+        sha1_process( ctx, ctx->buffer );
+        input += fill;
+        ilen  -= fill;
+        left = 0;
+    }
+
+    while( ilen >= 64 )
+    {
+        sha1_process( ctx, input );
+        input += 64;
+        ilen  -= 64;
+    }
+
+    if( ilen > 0 )
+        memcpy( (void *) (ctx->buffer + left), input, ilen );
+}
+
+static const unsigned char sha1_padding[64] =
+{
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+ * SHA-1 final digest
+ */
+void sha1_finish( sha1_context *ctx, unsigned char output[20] )
+{
+    uint32_t last, padn;
+    uint32_t high, low;
+    unsigned char msglen[8];
+
+    high = ( ctx->total[0] >> 29 )
+         | ( ctx->total[1] <<  3 );
+    low  = ( ctx->total[0] <<  3 );
+
+    PUT_UINT32_BE( high, msglen, 0 );
+    PUT_UINT32_BE( low,  msglen, 4 );
+
+    last = ctx->total[0] & 0x3F;
+    padn = ( last < 56 ) ? ( 56 - last ) : ( 120 - last );
+
+    sha1_update( ctx, sha1_padding, padn );
+    sha1_update( ctx, msglen, 8 );
+
+    PUT_UINT32_BE( ctx->state[0], output,  0 );
+    PUT_UINT32_BE( ctx->state[1], output,  4 );
+    PUT_UINT32_BE( ctx->state[2], output,  8 );
+    PUT_UINT32_BE( ctx->state[3], output, 12 );
+    PUT_UINT32_BE( ctx->state[4], output, 16 );
+}
+
+/*
+ * output = SHA-1( input buffer )
+ */
+void sha1( const unsigned char *input, size_t ilen, unsigned char output[20] )
+{
+    sha1_context ctx;
+
+    sha1_init( &ctx );
+    sha1_starts( &ctx );
+    sha1_update( &ctx, input, ilen );
+    sha1_finish( &ctx, output );
+    sha1_free( &ctx );
+}
--- a/log.h
+++ b/log.h
@@ -2,56 +2,8 @@
 #ifndef _LOG_MYLOG_H_
 #define _LOG_MYLOG_H_

-#include<stdio.h>
-#include<string.h>
-#include<sys/socket.h>
-#include<arpa/inet.h>
-#include<stdlib.h>
-#include<getopt.h>
-#include <unistd.h>
-#include<errno.h>

-#include <fcntl.h>
-//#include"aes.h"
-
-#include <sys/epoll.h>
-#include <sys/wait.h>
-
-#include<map>
-#include<string>
-#include<vector>
-
-
-#include <sys/socket.h>    //for socket ofcourse
-#include <sys/types.h>
-#include <stdlib.h> //for exit(0);
-#include <errno.h> //For errno - the error number
-#include <netinet/tcp.h>   //Provides declarations for tcp header
-#include <netinet/udp.h>
-#include <netinet/ip.h>    //Provides declarations for ip header
-#include <netinet/if_ether.h>
-#include <arpa/inet.h>
-#include <fcntl.h>
-#include <byteswap.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <arpa/inet.h>
-#include <linux/if_ether.h>
-#include <linux/filter.h>
-
-#include <sys/time.h>
-#include <time.h>
-
-#include <sys/timerfd.h>
-#include <set>
-#include "encrypt.h"
-#include <inttypes.h>
-
-#include <sys/ioctl.h>
-#include <netinet/in.h>
-#include <net/if.h>
-#include <arpa/inet.h>
-#include <stdarg.h>
+#include<common.h>

 using namespace std;

--- a/main.cpp
+++ b/main.cpp
--- a/52
+++ b/52
@@ -1,30 +1,51 @@
-cc_cross=/home/wangyu/OpenWrt-SDK-ar71xx-for-linux-x86_64-gcc-4.8-linaro_uClibc-0.9.33.2/staging_dir/toolchain-mips_34kc_gcc-4.8-linaro_uClibc-0.9.33.2/bin/mips-openwrt-linux-g++
+cc_cross=/home/wangyu/Desktop/arm-2014.05/bin/arm-none-linux-gnueabi-g++
 cc_local=g++
-cc_ar71xx=/home/wangyu/OpenWrt-SDK-ar71xx-for-linux-x86_64-gcc-4.8-linaro_uClibc-0.9.33.2/staging_dir/toolchain-mips_34kc_gcc-4.8-linaro_uClibc-0.9.33.2/bin/mips-openwrt-linux-g++
-cc_bcm2708=/home/wangyu/raspberry/tools/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian/bin/arm-linux-gnueabihf-g++ 
+cc_mips34kc=/toolchains/OpenWrt-SDK-ar71xx-for-linux-x86_64-gcc-4.8-linaro_uClibc-0.9.33.2/staging_dir/toolchain-mips_34kc_gcc-4.8-linaro_uClibc-0.9.33.2/bin/mips-openwrt-linux-g++
+cc_arm= /toolchains/arm-2014.05/bin/arm-none-linux-gnueabi-g++
+#cc_bcm2708=/home/wangyu/raspberry/tools/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian/bin/arm-linux-gnueabihf-g++ 
 FLAGS= -std=c++11 -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter -Wno-missing-field-initializers
-SOURCES=main.cpp lib/aes.c lib/md5.c encrypt.cpp log.cpp network.cpp common.cpp
+
+SOURCES=main.cpp lib/aes.c lib/md5.c encrypt.cpp log.cpp network.cpp common.cpp  -lpthread
+SOURCES_AES_ACC=$(filter-out lib/aes.c,$(SOURCES)) $(wildcard lib/aes_acc/aes*.c)
+
 NAME=udp2raw
-TAR=${NAME}_binaries.tar.gz ${NAME}_amd64  ${NAME}_x86  ${NAME}_ar71xx ${NAME}_bcm2708
+TARGETS=amd64 mips34kc arm amd64_hw_aes arm_asm_aes mips34kc_asm_aes x86 x86_asm_aes
+TAR=${NAME}_binaries.tar.gz `echo ${TARGETS}|sed -r 's/([^ ]+)/udp2raw_\1/g'`

 all:
 	rm -f ${NAME}
-	${cc_local}   -o ${NAME}          -I. ${SOURCES} ${FLAGS} -lrt  -static -O3
+	${cc_local}   -o ${NAME}          -I. ${SOURCES} ${FLAGS} -lrt -ggdb -static -O3
 fast:
 	rm -f ${NAME}
-	${cc_local}   -o ${NAME}          -I. ${SOURCES} ${FLAGS} -lrt
+	${cc_local}   -o ${NAME}          -I. ${SOURCES} ${FLAGS} -lrt -ggdb
 debug:
 	rm -f ${NAME}
 	${cc_local}   -o ${NAME}          -I. ${SOURCES} ${FLAGS} -lrt -Wformat-nonliteral -D MY_DEBUG 
+debug2:
+	rm -f ${NAME}
+	${cc_local}   -o ${NAME}          -I. ${SOURCES} ${FLAGS} -lrt -Wformat-nonliteral -ggdb

-ar71xx: 
-	${cc_ar71xx}  -o ${NAME}_ar71xx   -I. ${SOURCES} ${FLAGS} -lrt -lgcc_eh -static -O3
-bcm2708:
-	${cc_bcm2708} -o ${NAME}_bcm2708  -I. ${SOURCES} ${FLAGS} -lrt -static -O3
+mips34kc: 
+	${cc_mips34kc}  -o ${NAME}_$@   -I. ${SOURCES} ${FLAGS} -lrt -lgcc_eh -static -O3
+
+mips34kc_asm_aes: 
+	${cc_mips34kc}  -o ${NAME}_$@   -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -lgcc_eh -static -O3 lib/aes_acc/asm/mips_be.S
+
+#bcm2708:
+#	${cc_bcm2708} -o ${NAME}_bcm2708  -I. ${SOURCES} ${FLAGS} -lrt -static -O3
 amd64:
-	${cc_local}   -o ${NAME}_amd64    -I. ${SOURCES} ${FLAGS} -lrt -static -O3
+	${cc_local}   -o ${NAME}_$@    -I. ${SOURCES} ${FLAGS} -lrt -static -O3
+amd64_hw_aes:
+	${cc_local}   -o ${NAME}_$@   -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 lib/aes_acc/asm/x64.S
 x86:
-	${cc_local}   -o ${NAME}_x86      -I. ${SOURCES} ${FLAGS} -lrt -m32 -static -O3
+	${cc_local}   -o ${NAME}_$@      -I. ${SOURCES} ${FLAGS} -lrt -static -O3 -m32
+x86_asm_aes:
+	${cc_local}   -o ${NAME}_$@    -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 -m32 lib/aes_acc/asm/x86.S
+arm:
+	${cc_arm}   -o ${NAME}_$@      -I. ${SOURCES} ${FLAGS} -lrt -static -O3
+
+arm_asm_aes:
+	${cc_arm}   -o ${NAME}_$@    -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 lib/aes_acc/asm/arm.S

 cross:
 	${cc_cross}   -o ${NAME}_cross    -I. ${SOURCES} ${FLAGS} -lrt -O3
@@ -32,10 +53,13 @@ cross:
 cross2:
 	${cc_cross}   -o ${NAME}_cross    -I. ${SOURCES} ${FLAGS} -lrt -static -lgcc_eh -O3   

+cross3:
+	${cc_cross}   -o ${NAME}_cross    -I. ${SOURCES} ${FLAGS} -lrt -static -O3

-release: amd64 x86 ar71xx bcm2708
+release: ${TARGETS} 
 	tar -zcvf ${TAR}

 clean:	
 	rm -f ${TAR}
+	rm -f udp2raw udp2raw_cross udp2raw_cmake
 	
--- a/network.cpp
+++ b/network.cpp
@@ -12,7 +12,7 @@ int raw_recv_fd=-1;
 int raw_send_fd=-1;
 u32_t link_level_header_len=0;//set it to 14 if SOCK_RAW is used in socket(PF_PACKET, SOCK_RAW, htons(ETH_P_IP));

-int seq_mode=2;
+int seq_mode=1;

 int filter_port=-1;

@@ -21,13 +21,14 @@ int disable_bpf_filter=0;  //for test only,most time no need to disable this
 u32_t bind_address_uint32=0;

 int lower_level=0;
+int lower_level_manual=0;
 int ifindex=-1;
 char if_name[100]="";

 unsigned short g_ip_id_counter=0;

-unsigned char oppsite_hw_addr[6]=
-    {0xff,0xff,0xff,0xff,0xff,0xff};
+unsigned char dest_hw_addr[sizeof(sockaddr_ll::sll_addr)]=
+    {0xff,0xff,0xff,0xff,0xff,0xff,0,0};
 //{0x00,0x23,0x45,0x67,0x89,0xb9};

 struct sock_filter code_tcp_old[] = {
@@ -46,8 +47,8 @@ struct sock_filter code_tcp_old[] = {
 		{ 0x6, 0, 0, 0x00000000 },//12
 };
 struct sock_filter code_tcp[] = {
-{ 0x5, 0, 0, 0x00000001 },//0    //jump to 2,dirty hack from tcpdump -d's output
-{ 0x5, 0, 0, 0x00000000 },//1
+//{ 0x5, 0, 0, 0x00000001 },//0    //jump to 2,dirty hack from tcpdump -d's output
+//{ 0x5, 0, 0, 0x00000000 },//1
 { 0x30, 0, 0, 0x00000009 },//2
 { 0x15, 0, 6, 0x00000006 },//3
 { 0x28, 0, 0, 0x00000006 },//4
@@ -58,11 +59,11 @@ struct sock_filter code_tcp[] = {
 { 0x6, 0, 0, 0x0000ffff },//9
 { 0x6, 0, 0, 0x00000000 },//10
 };
-int code_tcp_port_index=8;
+int code_tcp_port_index=6;

 struct sock_filter code_udp[] = {
-{ 0x5, 0, 0, 0x00000001 },
-{ 0x5, 0, 0, 0x00000000 },
+//{ 0x5, 0, 0, 0x00000001 },
+//{ 0x5, 0, 0, 0x00000000 },
 { 0x30, 0, 0, 0x00000009 },
 { 0x15, 0, 6, 0x00000011 },
 { 0x28, 0, 0, 0x00000006 },
@@ -73,10 +74,10 @@ struct sock_filter code_udp[] = {
 { 0x6, 0, 0, 0x0000ffff },
 { 0x6, 0, 0, 0x00000000 },
 };
-int code_udp_port_index=8;
+int code_udp_port_index=6;
 struct sock_filter code_icmp[] = {
-{ 0x5, 0, 0, 0x00000001 },
-{ 0x5, 0, 0, 0x00000000 },
+//{ 0x5, 0, 0, 0x00000001 },
+//{ 0x5, 0, 0, 0x00000000 },
 { 0x30, 0, 0, 0x00000009 },
 { 0x15, 0, 1, 0x00000001 },
 { 0x6, 0, 0, 0x0000ffff },
@@ -194,7 +195,7 @@ int init_raw_socket()
 	        //perror("Failed to create raw_send_fd");
 	        myexit(1);
 	    }
-		init_ifindex(if_name);
+		//init_ifindex(if_name);

 	}

@@ -288,7 +289,7 @@ void remove_filter()
 		//exit(-1);
 	}
 }
-int init_ifindex(char * if_name)
+int init_ifindex(const char * if_name,int &index)
 {
 	struct ifreq ifr;
 	size_t if_name_len=strlen(if_name);
@@ -304,10 +305,230 @@ int init_ifindex(char * if_name)
 		mylog(log_fatal,"SIOCGIFINDEX fail ,%s\n",strerror(errno));
 		myexit(-1);
 	}
-	ifindex=ifr.ifr_ifindex;
-	mylog(log_info,"ifname:%s  ifindex:%d\n",if_name,ifindex);
+	index=ifr.ifr_ifindex;
+	mylog(log_info,"ifname:%s  ifindex:%d\n",if_name,index);
 	return 0;
 }
+bool interface_has_arp(const char * interface) {
+    struct ifreq ifr;
+   // int sock = socket(PF_INET6, SOCK_DGRAM, IPPROTO_IP);
+    int sock=raw_send_fd;
+    memset(&ifr, 0, sizeof(ifr));
+    strcpy(ifr.ifr_name, interface);
+    if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0) {
+            //perror("SIOCGIFFLAGS");
+    		mylog(log_fatal,"ioctl(sock, SIOCGIFFLAGS, &ifr) failed for interface %s,errno %s\n",interface,strerror(errno));
+            myexit(-1);
+    }
+    //close(sock);
+    return !(ifr.ifr_flags & IFF_NOARP);
+}
+struct route_info_t
+{
+	string if_name;
+	u32_t dest;
+	u32_t mask;
+	u32_t gw;
+	u32_t flag;
+
+};
+int dest_idx=1;
+int gw_idx=2;
+int if_idx=0;
+int mask_idx=7;
+int flag_idx=3;
+vector<int> find_route_entry(const vector<route_info_t> &route_info_vec,u32_t ip)
+{
+	vector<int> res;
+	for(u32_t i=0;i<=32;i++)
+	{
+		u32_t mask=0xffffffff;
+		//mask >>=i;
+		//if(i==32) mask=0;  //why 0xffffffff>>32  equals 0xffffffff??
+
+		mask <<=i;
+		if(i==32) mask=0;
+		log_bare(log_debug,"(mask:%x)",mask);
+		for(u32_t j=0;j<route_info_vec.size();j++)
+		{
+			const route_info_t & info=route_info_vec[j];
+			if(info.mask!=mask)
+				continue;
+			log_bare(log_debug,"<<%d,%d>>",i,j);
+			if((info.dest&mask)==(ip&mask))
+			{
+				log_bare(log_debug,"found!");
+				res.push_back(j);
+			}
+		}
+		if(res.size()!=0)
+		{
+			return res;
+		}
+	}
+	return res;
+}
+int find_direct_dest(const vector<route_info_t> &route_info_vec,u32_t ip,u32_t &dest_ip,string &if_name)
+{
+	vector<int> res;
+	for(int i=0;i<1000;i++)
+	{
+		res=find_route_entry(route_info_vec,ip);
+		log_bare(log_debug,"<entry:%u>",(u32_t)res.size());
+		if(res.size()==0)
+		{
+			mylog(log_error,"cant find route entry\n");
+			return -1;
+		}
+		if(res.size()>1)
+		{
+			mylog(log_error,"found duplicated entries\n");
+			return -1;
+		}
+		if((route_info_vec[res[0]].flag&2)==0)
+		{
+			dest_ip=ip;
+			if_name=route_info_vec[res[0]].if_name;
+			return 0;
+		}
+		else
+		{
+			ip=route_info_vec[res[0]].gw;
+		}
+	}
+	mylog(log_error,"dead loop in find_direct_dest\n");
+	return -1;
+}
+struct arp_info_t
+{
+	u32_t ip;
+	string hw;
+	string if_name;
+};
+int arp_ip_idx=0;
+int arp_hw_idx=3;
+int arp_if_idx=5;
+
+
+int find_arp(const vector<arp_info_t> &arp_info_vec,u32_t ip,string if_name,string &hw)
+{
+	int pos=-1;
+	int count=0;
+	for(u32_t i=0;i<arp_info_vec.size();i++)
+	{
+		const arp_info_t & info=arp_info_vec[i];
+		if(info.if_name!=if_name) continue;
+		if(info.ip==ip)
+		{
+			count++;
+			pos=i;
+		}
+	}
+	if(count==0)
+	{
+		//mylog(log_warn,"cant find arp entry for %s %s,using 00:00:00:00:00:00\n",my_ntoa(ip),if_name.c_str());
+		//hw="00:00:00:00:00:00";
+		mylog(log_error,"cant find arp entry for %s %s\n",my_ntoa(ip),if_name.c_str());
+		return -1;
+	}
+	if(count>1)
+	{
+		mylog(log_error,"find multiple arp entry for %s %s\n",my_ntoa(ip),if_name.c_str());
+		return -1;
+	}
+	hw=arp_info_vec[pos].hw;
+	return 0;
+}
+int find_lower_level_info(u32_t ip,u32_t &dest_ip,string &if_name,string &hw)
+{
+	ip=htonl(ip);
+	if(ip==htonl(inet_addr("127.0.0.1")))
+	{
+		dest_ip=ntohl(ip);
+		if_name="lo";
+		hw="00:00:00:00:00:00";
+		return 0;
+	}
+
+	string route_file;
+	if(read_file("/proc/net/route",route_file)!=0) return -1;
+	string arp_file;
+	if(read_file("/proc/net/arp",arp_file)!=0) return -1;
+
+	log_bare(log_debug,"/proc/net/route:<<%s>>\n",route_file.c_str());
+	log_bare(log_debug,"/proc/net/arp:<<%s>>\n",route_file.c_str());
+
+	auto route_vec2=string_to_vec2(route_file.c_str());
+	vector<route_info_t> route_info_vec;
+	for(u32_t i=1;i<route_vec2.size();i++)
+	{
+		log_bare(log_debug,"<size:%u>",(u32_t)route_vec2[i].size());
+		if(route_vec2[i].size()!=11)
+		{
+			mylog(log_error,"route coloum %d !=11 \n",int(route_vec2[i].size()));
+			return -1;
+		}
+		route_info_t tmp;
+		tmp.if_name=route_vec2[i][if_idx];
+		if(hex_to_u32_with_endian(route_vec2[i][dest_idx],tmp.dest)!=0) return -1;
+		if(hex_to_u32_with_endian(route_vec2[i][gw_idx],tmp.gw)!=0) return -1;
+		if(hex_to_u32_with_endian(route_vec2[i][mask_idx],tmp.mask)!=0) return -1;
+		if(hex_to_u32(route_vec2[i][flag_idx],tmp.flag)!=0)return -1;
+		route_info_vec.push_back(tmp);
+		for(u32_t j=0;j<route_vec2[i].size();j++)
+		{
+			log_bare(log_debug,"<%s>",route_vec2[i][j].c_str());
+		}
+		log_bare(log_debug,"%s dest:%x mask:%x gw:%x flag:%x",tmp.if_name.c_str(),tmp.dest,tmp.mask,tmp.gw,tmp.flag);
+		log_bare(log_debug,"\n");
+	}
+
+	if(find_direct_dest(route_info_vec,ip,dest_ip,if_name)!=0)
+	{
+		mylog(log_error,"find_direct_dest failed for ip %s\n",my_ntoa(ntohl(ip)));
+		return -1;
+	}
+
+
+	log_bare(log_debug,"========\n");
+	auto arp_vec2=string_to_vec2(arp_file.c_str());
+	vector<arp_info_t> arp_info_vec;
+	for(u32_t i=1;i<arp_vec2.size();i++)
+	{
+		log_bare(log_debug,"<<arp_vec2[i].size(): %d>>",(int)arp_vec2[i].size());
+
+		for(u32_t j=0;j<arp_vec2[i].size();j++)
+		{
+			log_bare(log_debug,"<%s>",arp_vec2[i][j].c_str());
+		}
+		if(arp_vec2[i].size()!=6)
+		{
+			mylog(log_error,"arp coloum %d !=11 \n",int(arp_vec2[i].size()));
+			return -1;
+		}
+		arp_info_t tmp;
+		tmp.if_name=arp_vec2[i][arp_if_idx];
+		tmp.hw=arp_vec2[i][arp_hw_idx];
+		tmp.ip=htonl(inet_addr(arp_vec2[i][arp_ip_idx].c_str()));
+		arp_info_vec.push_back(tmp);
+		log_bare(log_debug,"\n");
+	}
+	if(!interface_has_arp(if_name.c_str()))
+	{
+		mylog(log_info,"%s is a noarp interface,using 00:00:00:00:00:00\n",if_name.c_str());
+		hw="00:00:00:00:00:00";
+	}
+	else if(find_arp(arp_info_vec,dest_ip,if_name,hw)!=0)
+	{
+		mylog(log_error,"find_arp failed for dest_ip %s ,if_name %s\n",my_ntoa(ntohl(ip)),if_name.c_str());
+		return -1;
+	}
+	//printf("%s\n",hw.c_str());
+
+	dest_ip=ntohl(dest_ip);
+	return 0;
+}
+

 int send_raw_ip(raw_info_t &raw_info,const char * payload,int payloadlen)
 {
@@ -364,14 +585,9 @@ int send_raw_ip(raw_info_t &raw_info,const char * payload,int payloadlen)
    else
    {

-    	struct sockaddr_ll addr={0};
-    	//memset(&addr,0,sizeof(addr));
+    	struct sockaddr_ll addr={0};  //={0} not necessary
+    	memcpy(&addr,&send_info.addr_ll,sizeof(addr));

-    	addr.sll_family=AF_PACKET;
-    	addr.sll_ifindex=ifindex;
-    	addr.sll_halen=ETHER_ADDR_LEN;
-    	addr.sll_protocol=htons(ETH_P_IP);
-    	memcpy(addr.sll_addr,oppsite_hw_addr,ETHER_ADDR_LEN);
    	ret = sendto(raw_send_fd, send_raw_ip_buf, ip_tot_len ,  0, (struct sockaddr *) &addr, sizeof (addr));
    }
    if(ret==-1)
@@ -455,10 +671,10 @@ int recv_raw_ip(raw_info_t &raw_info,char * &payload,int &payloadlen)
 	static char recv_raw_ip_buf[buf_len];

 	iphdr *  iph;
-	struct sockaddr saddr={0};
+	struct sockaddr_ll saddr={0};
 	socklen_t saddr_size = sizeof(saddr);
 	int flag=0;
-	int recv_len = recvfrom(raw_recv_fd, recv_raw_ip_buf, max_data_len, flag ,&saddr , &saddr_size);
+	int recv_len = recvfrom(raw_recv_fd, recv_raw_ip_buf, max_data_len, flag ,(sockaddr*)&saddr , &saddr_size);

 	if(recv_len<0)
 	{
@@ -485,6 +701,10 @@ int recv_raw_ip(raw_info_t &raw_info,char * &payload,int &payloadlen)
 	recv_info.dst_ip=iph->daddr;
 	recv_info.protocol=iph->protocol;

+	if(lower_level)
+	{
+		memcpy(&recv_info.addr_ll,&saddr,sizeof(recv_info.addr_ll));
+	}


 	if(bind_address_uint32!=0 &&recv_info.dst_ip!=bind_address_uint32)
--- a/network.h
+++ b/network.h
@@ -16,8 +16,11 @@ extern u32_t bind_address_uint32;
 extern int disable_bpf_filter;

 extern int lower_level;
+extern int lower_level_manual;
 extern char if_name[100];
-extern unsigned char oppsite_hw_addr[];
+extern unsigned char dest_hw_addr[];
+
+extern int ifindex;

 struct icmphdr
 {
@@ -58,6 +61,9 @@ struct packet_info_t  //todo change this to union
 	uint16_t icmp_seq;

 	bool has_ts;
+
+	sockaddr_ll addr_ll;
+
 	packet_info_t();
 };

@@ -80,8 +86,9 @@ int init_raw_socket();
 void init_filter(int port);

 void remove_filter();
-int init_ifindex(char * if_name);
+int init_ifindex(const char * if_name,int &index);

+int find_lower_level_info(u32_t ip,u32_t &dest_ip,string &if_name,string &hw);

 int send_raw_ip(raw_info_t &raw_info,const char * payload,int payloadlen);
Author	SHA1	Message	Date
wangyu-	57be59e070	trival	2017-08-26 06:56:30 -05:00
wangyu-	fe77bb0c1a	avoid allocating large buf on stack	2017-08-26 06:35:47 -05:00
wangyu-	cb9504f904	Merge branch 'master' of https://github.com/wangyu-/udp2raw-tunnel	2017-08-26 05:38:40 -05:00
wangyu-	dc4936dc60	implemented --lower-level auto for client. better makefile	2017-08-26 05:38:33 -05:00
wangyu-	53609c25fc	Update openvpn_guide.md	2017-08-25 19:14:17 -07:00
wangyu-	60efcbce95	Merge pull request #35 from lance0/patch-1 typo and formatting.md	2017-08-25 08:55:51 -07:00
Lance Tuller	5c4ea515f6	Update README.md	2017-08-25 11:42:22 -04:00
Lance Tuller	e3b902a950	Update README.md formatting and a typo :)	2017-08-25 11:40:48 -04:00
wangyu-	37f2de2ae4	Update openvpn_guide.md	2017-08-24 01:47:52 -07:00
wangyu-	ae497908a1	Update README.md	2017-08-24 01:06:51 -07:00
wangyu-	b59ba05422	Update README.md	2017-08-24 01:05:12 -07:00
wangyu-	cd34922722	Update README.md	2017-08-24 00:49:28 -07:00
wangyu-	51e5b023b0	Update README.md	2017-08-24 00:35:13 -07:00
wangyu-	b1800fa41a	Update README.md	2017-08-24 00:23:58 -07:00
wangyu-	698ff33892	Merge branch 'master' of https://github.com/wangyu-/udp2raw-tunnel	2017-08-23 10:49:33 -05:00
wangyu-	1293b95335	bug fix and improve for conf	2017-08-23 10:47:09 -05:00
wangyu-	7e74b40977	Update README.zh-cn.md	2017-08-23 06:48:21 -07:00
wangyu-	d1e9bdc5da	Update README.zh-cn.md	2017-08-23 06:47:05 -07:00
wangyu-	2b9d5c6db6	Update README.md	2017-08-23 06:24:46 -07:00
wangyu-	b86d475f1f	moved cmakeLists	2017-08-23 07:23:02 -05:00
wangyu-	0a7f9b5cc6	fixed conf bugs and refactor	2017-08-23 07:01:21 -05:00
wangyu-	eb9633ed59	Update README.md	2017-08-22 22:08:51 -07:00
wangyu-	c0b464aeaf	Update README.md	2017-08-22 22:08:04 -07:00
wangyu-	020242b73d	Merge pull request #30 from PeterCxy/patch-config-file Allow loading options from configuration files	2017-08-22 22:01:28 -07:00
Peter Cai	2269879a2e	doc: note about comments	2017-08-23 11:22:33 +08:00
Peter Cai	057f81007d	main: don't warn config-file as 'unknown'	2017-08-23 11:16:41 +08:00
Peter Cai	1f2f0d96fd	main: fix missing includes	2017-08-23 11:09:46 +08:00
Peter Cai	01e0e51b9b	config: treat TABs as spaces	2017-08-23 11:03:27 +08:00
Peter Cai	6ef38709a6	config: trim spaces and tabs	2017-08-23 11:03:24 +08:00
Peter Cai	57b874ee6d	main: ignore empty lines in configuration	2017-08-23 11:03:21 +08:00
Peter Cai	ab0ce3ade3	main: merge cli/config arguments and check for duplications	2017-08-23 11:03:18 +08:00
Peter Cai	32f344fd3c	README: document configuration files	2017-08-23 11:03:16 +08:00
Peter Cai	968d35178a	main: allow loading configuration files (--config-file)	2017-08-23 11:03:12 +08:00
wangyu-	f8e268769c	trival	2017-08-22 21:10:32 -05:00
wangyu-	b9030fe81d	deleted useless headers	2017-08-22 11:00:44 -05:00
wangyu-	e6f0ed035c	Update README.md	2017-08-22 00:58:04 -07:00
wangyu-	00e1ddfecc	Update README.md	2017-08-21 21:20:24 -07:00
wangyu-	15e1790f73	Update README.md	2017-08-21 21:05:50 -07:00
wangyu-	763b0b7342	Update README.md	2017-08-21 21:03:16 -07:00
wangyu-	c716d617a0	Update README.md	2017-08-21 20:34:05 -07:00
wangyu-	60393d24d8	Merge pull request #33 from YUChoe/master Fix Typo TCP over TCP	2017-08-21 20:04:30 -07:00
Yong-uk Choe	9e7ed280fc	Fix Typo TCP over TCP	2017-08-22 09:44:19 +09:00
root	532693edc6	Merge branch 'master' of https://github.com/wangyu-/udp2raw-tunnel	2017-08-21 23:32:21 +08:00
root	033fa830f7	trival	2017-08-21 23:31:55 +08:00
wangyu-	51464ecbfe	Update README.zh-cn.md	2017-08-21 08:07:06 -07:00
root	dd8539e420	fixed --lower-level auto bug	2017-08-21 21:49:29 +08:00
wangyu	19ce820813	added new options to help page	2017-08-21 20:42:59 +08:00
root	b30a347c23	implemented --lower-level auto for server	2017-08-21 20:26:55 +08:00
wangyu-	3a35c5ce5b	Update build_guide.md	2017-08-21 02:51:00 -07:00
wangyu-	bab98d8356	Update build_guide.md	2017-08-21 02:50:49 -07:00
wangyu-	283167802c	Update build_guide.zh-cn.md	2017-08-21 02:39:43 -07:00
wangyu-	1db6509373	Update build_guide.zh-cn.md	2017-08-21 02:38:07 -07:00
wangyu-	a29cbf3779	Update openvpn_guide.md	2017-08-20 10:55:55 -07:00
wangyu-	da26c5047b	Update README.md	2017-08-20 09:11:03 -07:00
wangyu-	38d98b5b37	Update README.md	2017-08-20 09:01:25 -07:00
root	497320e446	added --keep-rule --gen-add option	2017-08-20 16:28:23 +08:00
wangyu	19da7b6972	added -simple rule option,added mips_asm_aes target	2017-08-19 18:17:39 +08:00
wangyu-	687666cf05	Merge pull request #24 from linusyang/md5 Use md5 hash from PolarSSL	2017-08-19 02:59:34 -07:00
Linus Yang	20f8aa743b	Use md5 hash from PolarSSL	2017-08-19 17:51:17 +08:00
wangyu-	8ce51fde50	Merge pull request #22 from linusyang/sep Separate encryption and decryption	2017-08-18 22:45:19 -07:00
Linus Yang	b9d7a91cae	Separate en/decryption	2017-08-19 13:22:31 +08:00
wangyu-	63dd25a0b5	Update README.zh-cn.md	2017-08-18 21:40:03 -07:00
wangyu-	c894596e06	Update README.zh-cn.md	2017-08-18 21:39:12 -07:00
wangyu-	558865f52d	Update README.zh-cn.md	2017-08-18 21:37:46 -07:00
wangyu-	604ff2978f	Update README.zh-cn.md	2017-08-18 21:36:41 -07:00
wangyu-	9046fd5889	Update README.zh-cn.md	2017-08-18 21:34:37 -07:00
wangyu-	241d305570	Update README.md	2017-08-18 21:29:08 -07:00
wangyu	edb3edb2cc	trival	2017-08-19 12:25:50 +08:00
wangyu	d9a24a5e42	added gitattributes	2017-08-19 12:19:45 +08:00
wangyu-	f9cf1c36a3	Merge pull request #21 from linusyang/asm Port more assembly code for AES acceleration	2017-08-18 16:27:45 -07:00
Linus Yang	4d797fb55d	Only use assembly code	2017-08-19 00:53:05 +08:00
Linus Yang	8d1e735041	Add mips big-endian asm	2017-08-19 00:53:05 +08:00
wangyu	769b99546b	added log,now listeing	2017-08-18 23:37:21 +08:00
wangyu	7d9ef910ca	add x86 asm_aes_target	2017-08-18 20:56:58 +08:00
wangyu	5f2838573a	added arm_asm_aes target	2017-08-18 20:09:13 +08:00
wangyu	2698ec9395	add arm_asm_aes target	2017-08-18 19:57:37 +08:00
wangyu	d09e0c51aa	deleted my encrypt_old	2017-08-18 19:43:46 +08:00
wangyu	7d306b2451	refactor	2017-08-18 19:42:29 +08:00
wangyu	d6d000e667	refactor	2017-08-18 19:35:05 +08:00
wangyu	00a0fe17bf	added eclipse project file	2017-08-18 19:26:54 +08:00
root	b74691f40a	refactor	2017-08-18 19:24:49 +08:00
wangyu	12741d9b9b	better bpf filter	2017-08-18 18:23:40 +08:00
wangyu-	ee4fe6cfcf	Merge pull request #16 from linusyang/aes Hardware-accelerated AES crypto on 64-bit Intel and ARM	2017-08-18 03:24:08 -07:00
Linus Yang	466ce76eca	Port OpenSSL asm code	2017-08-18 17:39:33 +08:00
Linus Yang	20ab7d920d	Use accelerated AES wrapper by default	2017-08-18 14:51:29 +08:00
Linus Yang	627e55932f	AES acceleration for x86_64 and arm64	2017-08-18 14:51:24 +08:00
wangyu-	91c427ebe4	Update README.zh-cn.md	2017-08-17 23:31:38 -07:00
wangyu-	4cf1dc7801	Update README.zh-cn.md	2017-08-17 23:29:26 -07:00
wangyu-	acc47afe29	Add files via upload	2017-08-17 23:26:29 -07:00
wangyu-	29cfbeb678	Update README.zh-cn.md	2017-08-17 23:20:24 -07:00
wangyu-	fb0daf5994	Update README.zh-cn.md	2017-08-17 23:18:14 -07:00
wangyu-	35af7008ef	Update README.zh-cn.md	2017-08-17 23:17:16 -07:00
wangyu-	af8160870b	Update README.zh-cn.md	2017-08-17 10:02:01 -07:00
wangyu-	effbda0918	Update README.md	2017-08-17 10:01:32 -07:00
wangyu	6c578738ca	trival	2017-08-18 00:58:40 +08:00
wangyu	a33133c3de	trival	2017-08-18 00:50:56 +08:00
root	eea016cab4	Merge branch 'master' of https://github.com/wangyu-/udp2raw-tunnel	2017-08-18 00:29:30 +08:00
wangyu	8356b45c3b	trival	2017-08-18 00:23:18 +08:00
wangyu	e502076394	fixed help page.some new function in common	2017-08-17 23:40:17 +08:00
wangyu-	1ca79a07b6	Update finalspeed_step_by_step.md	2017-08-16 22:11:29 -07:00
wangyu-	b35d8ecd34	Update kcptun_step_by_step.md	2017-08-16 21:46:54 -07:00
wangyu-	5ed3bae33d	Update kcptun_step_by_step.md	2017-08-16 21:46:36 -07:00
wangyu-	ec298b355e	Update finalspeed_step_by_step.md	2017-08-16 21:42:04 -07:00
wangyu-	33fb57d6d0	Update finalspeed_step_by_step.md	2017-08-16 21:36:03 -07:00
wangyu-	bdc7e3e43f	Update finalspeed_step_by_step.md	2017-08-16 21:30:07 -07:00
wangyu-	b10d9421d3	Update finalspeed_step_by_step.md	2017-08-16 21:29:38 -07:00
wangyu-	bdc1f74f8f	Update finalspeed_step_by_step.md	2017-08-16 21:28:18 -07:00
wangyu-	5cca489825	Add files via upload	2017-08-16 21:00:23 -07:00
wangyu-	3897ac3847	Update android_guide.md	2017-08-16 19:47:19 -07:00
wangyu-	231fd05680	Update README.zh-cn.md	2017-08-16 10:20:01 -07:00
wangyu-	3a362f6598	Update README.md	2017-08-16 08:58:00 -07:00
wangyu-	99b3ddf6a1	Update android_guide.md	2017-08-16 06:43:05 -07:00
wangyu-	09b00ef07b	Update android_guide.md	2017-08-16 06:24:45 -07:00
wangyu-	c0628959d4	Update README.zh-cn.md	2017-08-16 06:23:22 -07:00
wangyu-	9aa229569d	Update README.zh-cn.md	2017-08-16 06:23:03 -07:00
wangyu-	842766ae76	Update README.md	2017-08-16 06:21:33 -07:00
wangyu-	9e5b2140b3	Add files via upload	2017-08-16 06:18:00 -07:00
root	ac02ea91d7	Merge branch 'master' of https://github.com/wangyu-/udp2raw-tunnel	2017-08-16 19:45:26 +08:00
wangyu	837de123b2	added android target to makefile,fixed some log	2017-08-16 19:44:19 +08:00
wangyu-	358c5c47ff	Update README.zh-cn.md	2017-08-16 04:42:06 -07:00
wangyu-	19102700a5	Update README.md	2017-08-16 04:41:40 -07:00
wangyu-	f848839232	Update README.zh-cn.md	2017-08-16 04:38:19 -07:00
wangyu-	a13269a0b0	Update README.zh-cn.md	2017-08-16 04:37:39 -07:00
wangyu-	1d8223632b	Update README.md	2017-08-16 04:35:46 -07:00
wangyu-	e459b74bdf	Update README.md	2017-08-16 04:35:15 -07:00
wangyu-	9429a1c7db	Update README.zh-cn.md	2017-08-16 04:32:09 -07:00
wangyu-	f5d93cb80d	Update README.md	2017-08-16 04:29:35 -07:00
wangyu-	f591ba101c	Update android_guide.md	2017-08-16 04:27:50 -07:00
wangyu-	5236bc2f4e	Create android_guide.md	2017-08-16 04:26:06 -07:00
wangyu-	c7d9312cd2	Delete android_guide	2017-08-16 04:25:46 -07:00
wangyu-	85dfdd416f	Create android_guide	2017-08-16 04:25:19 -07:00
wangyu-	7a62918b74	Add files via upload	2017-08-16 04:15:39 -07:00
wangyu-	138b34335c	Update README.zh-cn.md	2017-08-16 04:12:34 -07:00
wangyu-	c5c74a3f35	Update README.md	2017-08-16 04:10:50 -07:00
wangyu-	0a4f53e579	Update README.md	2017-08-15 08:53:34 -07:00
wangyu-	dc41a3dbad	Update README.md	2017-08-15 08:48:31 -07:00
wangyu-	bd08278f1b	Update README.zh-cn.md	2017-08-15 08:29:02 -07:00
wangyu-	bcab893c53	Update README.md	2017-08-15 08:25:31 -07:00
wangyu-	f3f43b85d2	Update README.md	2017-08-15 08:24:54 -07:00
wangyu-	ed354347fd	Update README.md	2017-08-15 08:23:15 -07:00
wangyu-	fb32c56fb2	Update README.zh-cn.md	2017-08-15 08:09:24 -07:00