Initial Commit (tested training, testing, and TRT conversion)

2024-10-20 17:01:07 +08:00
parent 86d2f311f8
commit 5738088bae
221 changed files with 59249 additions and 6 deletions
--- a/flightlib/include/flightlib/bridges/unity_bridge.hpp
+++ b/flightlib/include/flightlib/bridges/unity_bridge.hpp
@@ -0,0 +1,115 @@
+//
+// This bridge was originally from FlightGoggles.
+// We made several changes on top of it.
+//
+#pragma once
+
+// std libs
+#include <opencv2/imgproc/types_c.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <experimental/filesystem>
+#include <fstream>
+#include <map>
+#include <random>
+#include <string>
+#include <thread>
+#include <unordered_map>
+
+// Include ZMQ bindings for communications with Unity.
+#include <zmqpp/zmqpp.hpp>
+
+// flightlib
+#include "flightlib/bridges/unity_message_types.hpp"
+#include "flightlib/common/logger.hpp"
+#include "flightlib/common/math.hpp"
+#include "flightlib/common/quad_state.hpp"
+#include "flightlib/common/types.hpp"
+#include "flightlib/objects/quadrotor.hpp"
+#include "flightlib/objects/static_object.hpp"
+#include "flightlib/objects/unity_camera.hpp"
+#include "flightlib/sensors/rgb_camera.hpp"
+
+using json = nlohmann::json;
+
+namespace flightlib {
+
+class UnityBridge {
+   public:
+	// constructor & destructor
+	UnityBridge();
+	~UnityBridge() {};
+
+	// connect function
+	bool initializeConnections(void);
+	bool connectUnity(const SceneID scene_id);
+	bool disconnectUnity(void);
+
+	// public get functions
+	bool getRender(const FrameID frame_id);
+	bool handleOutput(FrameID &frameID);
+	void refreshUnity(FrameID id);
+	void generatePointcloud(const Ref<Vector<3>> &min_corner, const Ref<Vector<3>> &max_corner, int ply_idx, std::string path, SceneID scene_id,
+	                        Scalar pc_resolution_ = 0.2);
+
+	// public set functions
+	bool setScene(const SceneID &scene_id);
+
+	// add object
+	bool addQuadrotor(std::shared_ptr<Quadrotor> quad);
+	bool addCamera(std::shared_ptr<UnityCamera> unity_camera);
+	bool addStaticObject(std::shared_ptr<StaticObject> static_object);
+
+	// public auxiliary functions
+	inline void setPubPort(const std::string &pub_port) { pub_port_ = pub_port; };
+	inline void setSubPort(const std::string &sub_port) { sub_port_ = sub_port; };
+	// create unity bridge
+	static std::shared_ptr<UnityBridge> getInstance(void) {
+		static std::shared_ptr<UnityBridge> bridge_ptr = std::make_shared<UnityBridge>();
+		return bridge_ptr;
+	};
+
+	// add tree
+	bool placeTrees(TreeMessage_t &tree_msg);
+	void rmTrees();
+	void pointCloudGenerator(PointCloudMessage_t &pcd_msg);
+	bool spawnTrees(Ref<Vector<3>> bounding_box_, Ref<Vector<3>> bounding_box_origin_, Scalar avg_tree_spacing_);
+
+   private:
+	template<typename T>
+	std::vector<double> convertToDoubleVector(const std::vector<T> &input);
+	//
+	SettingsMessage_t settings_;
+	PubMessage_t pub_msg_;
+	Logger logger_{"UnityBridge"};
+
+	std::vector<std::shared_ptr<Quadrotor>> unity_quadrotors_;
+	std::vector<std::shared_ptr<RGBCamera>> rgb_cameras_;
+	std::vector<std::shared_ptr<StaticObject>> static_objects_;
+
+	// ZMQ variables and functions
+	std::string client_address_;
+	std::string pub_port_;
+	std::string sub_port_;
+	zmqpp::context context_;
+	zmqpp::socket pub_{context_, zmqpp::socket_type::publish};
+	zmqpp::socket sub_{context_, zmqpp::socket_type::subscribe};
+	bool sendInitialSettings(void);
+	bool handleSettings(void);
+
+	// timing variables
+	int64_t num_frames_;
+	int64_t last_downloaded_utime_;
+	int64_t last_download_debug_utime_;
+	int64_t u_packet_latency_;
+
+	// axuiliary variables
+	const Scalar unity_connection_time_out_{60.0};
+	bool unity_ready_{false};
+
+	// Used for trees
+	std::random_device random_device_;
+	std::default_random_engine generator_;
+};
+}  // namespace flightlib
--- a/flightlib/include/flightlib/bridges/unity_message_types.hpp
+++ b/flightlib/include/flightlib/bridges/unity_message_types.hpp
@@ -0,0 +1,309 @@
+//
+// This bridge message types was originally from FlightGoggles.
+// We made several changes on top of it.
+//
+#pragma once
+
+// std
+#include <string>
+
+// opencv
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+
+// flightlib
+#include "flightlib/common/types.hpp"
+#include "flightlib/json/json.hpp"
+
+using json = nlohmann::json;
+
+namespace flightlib {
+
+enum UnityScene {
+  WAREHOUSE = 0,
+  NATUREFOREST = 3,
+  SCENE_EMPTYFOREST = 4,
+  SCENE_GRANDCANYON = 5,
+  SCENE_WASTELAND = 6,
+  SCENE_JAPANESE = 7,
+  // total number of environment
+  SceneNum = 6
+};
+
+struct Camera_t {
+  std::string ID;
+  // frame Metadata
+  // int64_t ntime; // deprecated
+  int channels{3};
+  int width{1024};
+  int height{768};
+  double fov{70.0f};
+  double depth_scale{0.20};  // 0.xx corresponds to xx cm resolution
+  // metadata
+  bool is_depth{false};
+  int output_index{0};
+  //
+  std::vector<std::string> post_processing;
+  // Transformation matrix from camera to vehicle body 4 x 4
+  // use 1-D vector for json convention
+  std::vector<double> T_BC{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+                           0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+};
+
+struct Lidar_t {
+  std::string ID;
+  int num_beams{10};
+  double max_distance{10.0};
+  double start_scan_angle{-M_PI / 2};
+  double end_scan_angle{M_PI / 2};
+  // Transformation matrix from lidar to vehicle body 4 x 4
+  // use 1-D vector for json convention
+  std::vector<double> T_BS{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+                           0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+};
+
+// Unity Vechicle, e.g., quadrotor
+struct Vehicle_t {
+  std::string ID;
+  // unity coordinate system left-handed, y up
+  std::vector<double> position{0.0, 0.0, 0.0};
+  // unity quaternion (x, y, z, w)
+  std::vector<double> rotation{0.0, 0.0, 0.0, 1.0};
+  std::vector<double> size{1.0, 1.0, 1.0};  // scale
+  // sensors attached on the vehicle
+  std::vector<Camera_t> cameras;
+  std::vector<Lidar_t> lidars;
+  // collision check
+  bool has_collision_check = true;
+};
+
+// Unity object, e.g., gate, light, etc...
+struct Object_t {
+  std::string ID;
+  std::string prefab_ID;
+  // unity coordinate system left hand, y up
+  std::vector<double> position{0.0, 0.0, 0.0};
+  // unity quaternion (x, y, z, w)
+  std::vector<double> rotation{0.0, 0.0, 0.0, 1.0};
+  std::vector<double> size{1.0, 1.0, 1.0};  // scale
+};
+
+struct SettingsMessage_t {
+  // scene/render settings
+  size_t scene_id = UnityScene::WAREHOUSE;
+
+  //
+  std::vector<Vehicle_t> vehicles;
+  std::vector<Object_t> objects;
+};
+
+struct PubMessage_t {
+  FrameID ntime{0};
+  std::vector<Vehicle_t> vehicles;
+  std::vector<Object_t> objects;
+};
+
+//
+struct Sub_Vehicle_t {
+  bool collision;
+  std::vector<double> lidar_ranges;
+};
+
+struct SubMessage_t {
+  //
+  FrameID ntime{0};
+  //
+  std::vector<Sub_Vehicle_t> sub_vehicles;
+};
+
+struct PointCloudMessage_t {
+  int scene_id{0};
+  std::vector<double> bounding_box_scale{100.0, 100.0, 100.0};
+  std::vector<double> bounding_box_origin{0.0, 0.0, 0.0};
+  double resolution_z{0.1};
+  double ground_z_limit{0.2};
+  double resolution_above_ground{0.1};
+  double resolution_below_ground{0.1};
+  std::string path{"point_clouds_data/"};
+  std::string file_name{"default"};
+  double unity_ground_offset{0.3};
+};
+
+struct TreeMessage_t {
+  std::vector<double> bounding_area{25.0, 25.0};
+  std::vector<double> bounding_origin{0.0, 0.0};
+  double density{4.0};
+  int seed{-1};
+};
+
+struct ObjectMessage_t {
+  std::vector<std::string> name = {"Cube"};  // Cube, Sphere, Cylinder
+  std::vector<double> bounding_area{25.0, 25.0, 25.0};
+  std::vector<double> bounding_origin{0.0, 0.0, 0.0};
+  double density{4.0};
+  double rand_size{0.0};
+  std::vector<double> scale_min{
+      0.1,
+      0.3,
+      2.0,
+  };
+  std::vector<double> scale_max{1.0, 0.3, 10.0};
+
+  std::vector<double> rot_min{0.0, 0.0, 0.0};
+  std::vector<double> rot_max{360.0, 360.0, 360.0};
+  int seed{-1};
+};
+
+struct FixRatioObjectMessage_t {
+  std::string name = "Cube";  // Cube, Sphere, Cylinder
+  std::vector<double> bounding_area{25.0, 25.0};
+  std::vector<double> bounding_origin{0.0, 0.0};
+  double density{4.0};
+  double scale_min{0.1};
+  double scale_max{2.0};
+  double rot_min{0.0};  // rotation around z-axix
+  double rot_max{360.0};
+  int seed{-1};
+};
+
+struct LightMessage_t {
+  double red{0};
+  double green{0};
+  double blue{0};
+  double intensity{1.0};
+};
+
+/*********************
+ * JSON constructors *
+ *********************/
+// Camera_t
+inline void to_json(json &j, const Camera_t &o) {
+  j = json{{"ID", o.ID},
+           {"channels", o.channels},
+           {"width", o.width},
+           {"height", o.height},
+           {"fov", o.fov},
+           {"T_BC", o.T_BC},
+           {"isDepth", o.is_depth},
+           {"post_processing", o.post_processing},
+           {"depthScale", o.depth_scale},
+           {"outputIndex", o.output_index}};
+}
+
+// Lidar
+inline void to_json(json &j, const Lidar_t &o) {
+  j = json{{"ID", o.ID},
+           {"num_beams", o.num_beams},
+           {"max_distance", o.max_distance},
+           {"start_angle", o.start_scan_angle},
+           {"end_angle", o.end_scan_angle},
+           {"T_BS", o.T_BS}};
+}
+// Vehicle_t
+inline void to_json(json &j, const Vehicle_t &o) {
+  j = json{{"ID", o.ID},
+           {"position", o.position},
+           {"rotation", o.rotation},
+           {"size", o.size},
+           {"cameras", o.cameras},
+           {"lidars", o.lidars},
+           {"hasCollisionCheck", o.has_collision_check}};
+}
+
+// Object_t
+inline void to_json(json &j, const Object_t &o) {
+  j = json{{"ID", o.ID},
+           {"prefabID", o.prefab_ID},
+           {"position", o.position},
+           {"rotation", o.rotation},
+           {"size", o.size}};
+}
+
+// Setting messages, pub to unity
+inline void to_json(json &j, const SettingsMessage_t &o) {
+  j = json{{"scene_id", o.scene_id},
+           {"vehicles", o.vehicles},
+           {"objects", o.objects}};
+}
+
+
+// Publish messages to unity
+inline void to_json(json &j, const PubMessage_t &o) {
+  j = json{
+      {"ntime", o.ntime}, {"vehicles", o.vehicles}, {"objects", o.objects}};
+}
+
+// Publish messages to unity
+inline void from_json(const json &j, Sub_Vehicle_t &o) {
+  o.collision = j.at("collision").get<bool>();
+  o.lidar_ranges = j.at("lidar_ranges").get<std::vector<double>>();
+}
+
+// json to our sub message data type
+// note: pub_vechicles is defined in Unity which corresponding
+// to our sub_vehicles in ROS.
+inline void from_json(const json &j, SubMessage_t &o) {
+  o.ntime = j.at("ntime").get<uint64_t>();
+  o.sub_vehicles = j.at("pub_vehicles").get<std::vector<Sub_Vehicle_t>>();
+}
+
+inline void to_json(json &j, const PointCloudMessage_t &o) {
+  j = json{{"scene_id", o.scene_id},
+           {"bounding_box_scale", o.bounding_box_scale},
+           {"bounding_box_origin", o.bounding_box_origin},
+           {"resolution_z", o.resolution_z},
+           {"ground_z_limit", o.ground_z_limit},
+           {"resolution_above_ground", o.resolution_above_ground},
+           {"resolution_below_ground", o.resolution_below_ground},
+           {"path", o.path},
+           {"file_name", o.file_name},
+           {"unity_ground_offset", o.unity_ground_offset}};
+}
+
+inline void to_json(json &j, const TreeMessage_t &o) {
+  j = json{{"bounding_area", o.bounding_area},
+           {"bounding_origin", o.bounding_origin},
+           {"density", o.density},
+           {"seed", o.seed}};
+}
+
+inline void to_json(json &j, const ObjectMessage_t &o) {
+  j = json{{"bounding_area", o.bounding_area},
+           {"bounding_origin", o.bounding_origin},
+           {"density", o.density},
+           {"rand_size", o.rand_size},
+           {"seed", o.seed},
+           {"name", o.name},
+           {"scale_min", o.scale_min},
+           {"scale_max", o.scale_max},
+           {"rot_min", o.rot_min},
+           {"rot_max", o.rot_max}};
+}
+
+inline void to_json(json &j, const FixRatioObjectMessage_t &o) {
+  j = json{{"bounding_area", o.bounding_area},
+           {"bounding_origin", o.bounding_origin},
+           {"density", o.density},
+           {"seed", o.seed},
+           {"name", o.name},
+           {"scale_min", o.scale_min},
+           {"scale_max", o.scale_max},
+           {"rot_min", o.rot_min},
+           {"rot_max", o.rot_max}};
+}
+
+inline void to_json(json &j, const LightMessage_t &o) {
+  j = json{{"red", o.red},
+           {"green", o.green},
+           {"blue", o.blue},
+           {"intensity", o.intensity}};
+}
+
+// Struct for outputting parsed received messages to handler functions
+struct RenderMessage_t {
+  SubMessage_t sub_msg;
+  std::vector<cv::Mat> images;
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/common/command.hpp
+++ b/flightlib/include/flightlib/common/command.hpp
@@ -0,0 +1,37 @@
+
+#pragma once
+
+#include <cmath>
+
+#include "flightlib/common/types.hpp"
+
+namespace flightlib {
+
+struct Command {
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+
+  Command();
+
+  Command(const Scalar t, const Scalar thrust, const Vector<3>& omega);
+
+  Command(const Scalar t, const Vector<4>& thrusts);
+
+  bool valid() const;
+  bool isSingleRotorThrusts() const;
+  bool isRatesThrust() const;
+
+  /// time in [s]
+  Scalar t{NAN};
+
+  /// Collective mass-normalized thrust in [m/s^2]
+  Scalar collective_thrust{NAN};
+
+  /// Bodyrates in [rad/s]
+  Vector<3> omega{NAN, NAN, NAN};
+
+  /// Single rotor thrusts in [N]
+  Vector<4> thrusts{NAN, NAN, NAN, NAN};
+
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/common/integrator_base.hpp
+++ b/flightlib/include/flightlib/common/integrator_base.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <functional>
+
+#include "flightlib/common/quad_state.hpp"
+#include "flightlib/common/types.hpp"
+
+namespace flightlib {
+
+class IntegratorBase {
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+ public:
+  using DynamicsFunction =
+    std::function<bool(const Ref<const Vector<>>, Ref<Vector<>>)>;
+  IntegratorBase(DynamicsFunction function, const Scalar dt_max = 1e-3);
+
+  bool integrate(const QuadState& initial, QuadState* const final) const;
+
+  bool integrate(const Ref<const Vector<>> initial, const Scalar dt,
+                 Ref<Vector<>> final) const;
+
+  virtual bool step(const Ref<const Vector<>> initial, const Scalar dt,
+                    Ref<Vector<>> final) const = 0;
+
+  Scalar dtMax() const;
+
+ protected:
+  DynamicsFunction dynamics_;
+  Scalar dt_max_;
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/common/integrator_euler.hpp
+++ b/flightlib/include/flightlib/common/integrator_euler.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "flightlib/common/integrator_base.hpp"
+
+namespace flightlib {
+
+class IntegratorEuler : public IntegratorBase {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+  using IntegratorBase::DynamicsFunction;
+  using IntegratorBase::IntegratorBase;
+
+  bool step(const Ref<const Vector<>> initial, const Scalar dt,
+            Ref<Vector<>> final) const;
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/common/integrator_rk4.hpp
+++ b/flightlib/include/flightlib/common/integrator_rk4.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "flightlib/common/integrator_base.hpp"
+
+namespace flightlib {
+
+class IntegratorRK4 : public IntegratorBase {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+  using IntegratorBase::DynamicsFunction;
+  using IntegratorBase::IntegratorBase;
+
+  bool step(const Ref<const Vector<>> initial, const Scalar dt,
+            Ref<Vector<>> final) const;
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/common/logger.hpp
+++ b/flightlib/include/flightlib/common/logger.hpp
@@ -0,0 +1,102 @@
+// """credit: Philipp Foehn """
+#pragma once
+
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+namespace flightlib {
+
+class Logger {
+ public:
+  Logger(const std::string& name, const bool color = true);
+  Logger(const std::string& name, const std::string& filename);
+  ~Logger();
+
+  inline std::streamsize precision(const std::streamsize n);
+  inline void scientific(const bool on = true);
+
+  template<class... Args>
+  void info(const std::string& message, const Args&... args) const;
+  void info(const std::string& message) const;
+
+  template<class... Args>
+  void warn(const std::string& message, const Args&... args) const;
+  void warn(const std::string& message) const;
+
+  template<class... Args>
+  void error(const std::string& message, const Args&... args) const;
+  void error(const std::string& message) const;
+
+  template<class... Args>
+  void fatal(const std::string& message, const Args&... args) const;
+  void fatal(const std::string& message) const;
+
+  template<typename T>
+  std::ostream& operator<<(const T& printable) const;
+
+  static constexpr int MAX_CHARS = 256;
+
+ private:
+  static constexpr int DEFAULT_PRECISION = 3;
+  static constexpr int NAME_PADDING = 15;
+  static constexpr char RESET[] = "\033[0m";
+  static constexpr char RED[] = "\033[31m";
+  static constexpr char YELLOW[] = "\033[33m";
+  static constexpr char INFO[] = "Info:    ";
+  static constexpr char WARN[] = "Warning: ";
+  static constexpr char ERROR[] = "Error:   ";
+  static constexpr char FATAL[] = "Fatal:   ";
+  //
+  std::string name_;
+  mutable std::ostream sink_;
+  const bool colored_;
+};
+
+template<class... Args>
+void Logger::info(const std::string& message, const Args&... args) const {
+  char buf[MAX_CHARS];
+  const int n = std::snprintf(buf, MAX_CHARS, message.c_str(), args...);
+  if (n >= 0 && n < MAX_CHARS)
+    info(buf);
+  else
+    error("=== Logging error ===\n");
+}
+
+template<class... Args>
+void Logger::warn(const std::string& message, const Args&... args) const {
+  char buf[MAX_CHARS];
+  const int n = std::snprintf(buf, MAX_CHARS, message.c_str(), args...);
+  if (n >= 0 && n < MAX_CHARS)
+    warn(buf);
+  else
+    error("=== Logging error ===\n");
+}
+
+template<class... Args>
+void Logger::error(const std::string& message, const Args&... args) const {
+  char buf[MAX_CHARS];
+  const int n = std::snprintf(buf, MAX_CHARS, message.c_str(), args...);
+  if (n >= 0 && n < MAX_CHARS)
+    error(buf);
+  else
+    error("=== Logging error ===\n");
+}
+
+template<class... Args>
+void Logger::fatal(const std::string& message, const Args&... args) const {
+  char buf[MAX_CHARS];
+  const int n = std::snprintf(buf, MAX_CHARS, message.c_str(), args...);
+  if (n >= 0 && n < MAX_CHARS)
+    fatal(buf);
+  else
+    fatal("=== Logging error ===\n");
+}
+
+template<typename T>
+std::ostream& Logger::operator<<(const T& printable) const {
+  return sink_ << name_ << printable;
+}
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/common/math.hpp
+++ b/flightlib/include/flightlib/common/math.hpp
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "flightlib/common/types.hpp"
+
+namespace flightlib {
+
+Matrix<3, 3> skew(const Vector<3>& v);
+
+Matrix<4, 4> Q_left(const Quaternion& q);
+
+Matrix<4, 4> Q_right(const Quaternion& q);
+
+Matrix<4, 3> qFromQeJacobian(const Quaternion& q);
+
+Matrix<4, 4> qConjugateJacobian();
+
+Matrix<3, 3> qeRotJacobian(const Quaternion& q, const Matrix<3, 1>& t);
+
+Matrix<3, 3> qeInvRotJacobian(const Quaternion& q, const Matrix<3, 1>& t);
+
+void matrixToTripletList(const SparseMatrix& matrix,
+                         std::vector<SparseTriplet>* const list,
+                         const int row_offset = 0, const int col_offset = 0);
+
+void matrixToTripletList(const Matrix<>& matrix,
+                         std::vector<SparseTriplet>* const list,
+                         const int row_offset = 0, const int col_offset = 0);
+
+void insert(const SparseMatrix& from, SparseMatrix* const into,
+            const int row_offset = 0, const int col_offset = 0);
+
+void insert(const Matrix<>& from, SparseMatrix* const into,
+            const int row_offset = 0, const int col_offset = 0);
+
+void insert(const Matrix<>& from, Matrix<>* const into,
+            const int row_offset = 0, const int col_offset = 0);
+
+void quaternionToEuler(const Quaternion& quat, Ref<Vector<3>> euler);
+
+std::vector<Scalar> transformationRos2Unity(const Matrix<4, 4>& ros_tran_mat);
+
+std::vector<Scalar> positionRos2Unity(const Vector<3>& ros_pos_vec);
+
+std::vector<Scalar> quaternionRos2Unity(const Quaternion& ros_quat);
+
+std::vector<Scalar> scalarRos2Unity(const Vector<3>& ros_scale);
+
+void get_euler_from_R(Vector<3> &e, const Matrix<3,3> &R);
+float calculate_yaw(float yaw_cur, float yaw_ref, float sim_t);
+
+double wrapMinusPiToPi(const double angle);
+double wrapZeroToTwoPi(const double angle);
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/common/parameter_base.hpp
+++ b/flightlib/include/flightlib/common/parameter_base.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+
+#include <yaml-cpp/yaml.h>
+
+namespace flightlib {
+
+struct ParameterBase {
+  ParameterBase();
+  ParameterBase(const std::string& cfg_path);
+  ParameterBase(const YAML::Node& cfg_node);
+
+  virtual ~ParameterBase();
+
+  virtual bool valid() = 0;
+  virtual bool loadParam() = 0;
+
+  YAML::Node cfg_node_;
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/common/pend_state.hpp
+++ b/flightlib/include/flightlib/common/pend_state.hpp
@@ -0,0 +1,92 @@
+#pragma once
+
+#include <memory>
+
+#include "flightlib/common/types.hpp"
+
+namespace flightlib {
+
+struct PendState {
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+
+  enum IDX : int {
+    POS = 0,
+    POSX = 0,
+    POSY = 1,
+    POSZ = 2,
+    NPOS = 3,
+    ATT = 3,
+    ATTW = 3,
+    ATTX = 4,
+    ATTY = 5,
+    ATTZ = 6,
+    NATT = 4,
+    VEL = 7,
+    VELX = 7,
+    VELY = 8,
+    VELZ = 9,
+    NVEL = 3,
+    OME = 10,
+    OMEX = 10,
+    OMEY = 11,
+    OMEZ = 12,
+    NOME = 3,
+    ACC = 13,
+    ACCX = 13,
+    ACCY = 14,
+    ACCZ = 15,
+    NACC = 3,
+    TAU = 16,
+    TAUX = 16,
+    TAUY = 17,
+    TAUZ = 18,
+    NTAU = 3,
+    BOME = 19,
+    BOMEX = 19,
+    BOMEY = 20,
+    BOMEZ = 21,
+    NBOME = 3,
+    BACC = 22,
+    BACCX = 22,
+    BACCY = 23,
+    BACCZ = 24,
+    NBACC = 3,
+    SIZE = 25,
+    DYN = 19
+  };
+
+  PendState();
+  PendState(const Vector<IDX::SIZE>& x, const Scalar t = NAN);
+  PendState(const PendState& state);
+  ~PendState();
+
+  inline static int size() { return SIZE; }
+  Quaternion q() const;
+  void q(const Quaternion quaternion);
+  Matrix<3, 3> R() const;
+  void setZero();
+
+  inline bool valid() const { return x.allFinite() && std::isfinite(t); }
+
+  Vector<IDX::SIZE> x = Vector<IDX::SIZE>::Constant(NAN);
+  Scalar t{NAN};
+
+  Ref<Vector<3>> p{x.segment<IDX::NPOS>(IDX::POS)};
+  Ref<Vector<4>> qx{x.segment<IDX::NATT>(IDX::ATT)};
+  Ref<Vector<3>> v{x.segment<IDX::NVEL>(IDX::VEL)};
+  Ref<Vector<3>> w{x.segment<IDX::NOME>(IDX::OME)};
+  Ref<Vector<3>> a{x.segment<IDX::NACC>(IDX::ACC)};
+  Ref<Vector<3>> tau{x.segment<IDX::NTAU>(IDX::TAU)};
+  Ref<Vector<3>> bw{x.segment<IDX::NBOME>(IDX::BOME)};
+  Ref<Vector<3>> ba{x.segment<IDX::NBACC>(IDX::BACC)};
+
+  bool operator==(const PendState& rhs) const {
+    return t == rhs.t && x.isApprox(rhs.x, 1e-5);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const PendState& state);
+};
+
+using PS = PendState;
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/common/quad_state.hpp
+++ b/flightlib/include/flightlib/common/quad_state.hpp
@@ -0,0 +1,109 @@
+#pragma once
+
+#include <memory>
+
+#include "flightlib/common/types.hpp"
+
+namespace flightlib {
+
+struct QuadState {
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+
+  enum IDX : int {
+    // position
+    POS = 0,
+    POSX = 0,
+    POSY = 1,
+    POSZ = 2,
+    NPOS = 3,
+    // quaternion
+    ATT = 3,
+    ATTW = 3,
+    ATTX = 4,
+    ATTY = 5,
+    ATTZ = 6,
+    NATT = 4,
+    // linear velocity
+    VEL = 7,
+    VELX = 7,
+    VELY = 8,
+    VELZ = 9,
+    NVEL = 3,
+    // body rate
+    OME = 10,
+    OMEX = 10,
+    OMEY = 11,
+    OMEZ = 12,
+    NOME = 3,
+    // linear acceleration
+    ACC = 13,
+    ACCX = 13,
+    ACCY = 14,
+    ACCZ = 15,
+    NACC = 3,
+    // body-torque
+    TAU = 16,
+    TAUX = 16,
+    TAUY = 17,
+    TAUZ = 18,
+    NTAU = 3,
+    //
+    BOME = 19,
+    BOMEX = 19,
+    BOMEY = 20,
+    BOMEZ = 21,
+    NBOME = 3,
+    //
+    BACC = 22,
+    BACCX = 22,
+    BACCY = 23,
+    BACCZ = 24,
+    NBACC = 3,
+    //
+    SIZE = 25,
+    NDYM = 19
+  };
+
+  QuadState();
+  QuadState(const Vector<IDX::SIZE>& x, const Scalar t = NAN);
+  QuadState(const QuadState& state);
+  ~QuadState();
+
+  inline static int size() { return SIZE; }
+  Quaternion q() const;
+  void q(const Quaternion quaternion);
+  Matrix<3, 3> R() const;
+  void setZero();
+
+  inline bool valid() const { return x.allFinite() && std::isfinite(t); }
+
+  Vector<IDX::SIZE> x = Vector<IDX::SIZE>::Constant(NAN);
+  Scalar t{NAN};
+
+  // position
+  Ref<Vector<3>> p{x.segment<IDX::NPOS>(IDX::POS)};
+  // orientation (quaternion)
+  Ref<Vector<4>> qx{x.segment<IDX::NATT>(IDX::ATT)};
+  // linear velocity
+  Ref<Vector<3>> v{x.segment<IDX::NVEL>(IDX::VEL)};
+  // angular velocity
+  Ref<Vector<3>> w{x.segment<IDX::NOME>(IDX::OME)};
+  // linear accleration
+  Ref<Vector<3>> a{x.segment<IDX::NACC>(IDX::ACC)};
+  // body torque
+  Ref<Vector<3>> tau{x.segment<IDX::NTAU>(IDX::TAU)};
+  //
+  Ref<Vector<3>> bw{x.segment<IDX::NBOME>(IDX::BOME)};
+  //
+  Ref<Vector<3>> ba{x.segment<IDX::NBACC>(IDX::BACC)};
+
+  bool operator==(const QuadState& rhs) const {
+    return t == rhs.t && x.isApprox(rhs.x, 1e-5);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const QuadState& state);
+};
+
+using QS = QuadState;
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/common/timer.hpp
+++ b/flightlib/include/flightlib/common/timer.hpp
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <chrono>
+#include <iostream>
+#include <string>
+
+#include "flightlib/common/types.hpp"
+
+namespace flightlib {
+
+/*
+ * Timer class to perform runtime analytics.
+ *
+ * This timer class provides a simple solution to time code.
+ * Simply construct a timer and call it's `tic()` and `toc()` functions to time
+ * code. It is intended to be used to time multiple calls of a function and not
+ * only reports the `last()` timing, but also statistics such as the `mean()`,
+ * `min()`, `max()` time, the `count()` of calls to the timer , and even
+ * standard deviation `std()`.
+ *
+ * The constructor can take a name for the timer (like "update") and a name for
+ * the module (like "Filter").
+ * After construction it can be `reset()` if needed.
+ *
+ * A simple way to get the timing and stats is `std::cout << timer;` which can
+ * output to arbitrary streams, overloading the stream operator,
+ * or `print()` which always prints to console.
+ *
+ */
+class Timer {
+ public:
+  Timer(const std::string name = "", const std::string module = "");
+  Timer(const Timer& other);
+  ~Timer() {}
+
+  /// Start the timer.
+  void tic();
+
+  /// Stops timer, calculates timing, also tics again.
+  Scalar toc();
+
+  /// Reset saved timings and calls;
+  void reset();
+
+  // Accessors
+  Scalar operator()() const;
+  Scalar mean() const;
+  Scalar last() const;
+  Scalar min() const;
+  Scalar max() const;
+  Scalar std() const;
+  int count() const;
+
+  /// Custom stream operator for outputs.
+  friend std::ostream& operator<<(std::ostream& os, const Timer& timer);
+
+  /// Print timing information to console.
+  void print() const;
+
+ private:
+  std::string name_, module_;
+  using TimePoint = std::chrono::high_resolution_clock::time_point;
+  TimePoint t_start_;
+
+  // Initialize timing to impossible values.
+  Scalar timing_mean_;
+  Scalar timing_last_;
+  Scalar timing_S_;
+  Scalar timing_min_;
+  Scalar timing_max_;
+
+  int n_samples_;
+};
+
+/*
+ * Simple class to time scopes.
+ *
+ * This effectively instantiates a timer and calls `tic()` in its constructor
+ * and `toc()` and ` print()` in its destructor.
+ */
+class ScopedTimer : public Timer {
+ public:
+  ScopedTimer(const std::string name = "", const std::string module = "");
+  ~ScopedTimer();
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/common/types.hpp
+++ b/flightlib/include/flightlib/common/types.hpp
@@ -0,0 +1,101 @@
+#pragma once
+
+#include <eigen3/Eigen/Eigen>
+
+namespace flightlib {
+
+// ------------ General Stuff-------------
+
+// Define the scalar type used.
+using Scalar = float;  // numpy float32
+
+// Define frame id for unity
+using FrameID = uint64_t;
+
+using USecs = uint64_t;
+
+// Define frame id for unity
+using SceneID = size_t;
+
+
+// ------------ Eigen Stuff-------------
+// Define `Dynamic` matrix size.
+static constexpr int Dynamic = Eigen::Dynamic;
+
+// Using shorthand for 'DepthImg<ros, cols>' with scalar type.
+template<int rows = Dynamic, int cols = Dynamic>
+using DepthImgMatrix = Eigen::Matrix<float_t, rows, cols>;
+
+// Using shorthand for `DepthImgVector<ros>` with scalar type.
+template<int rows = Dynamic>
+using DepthImgVector = DepthImgMatrix<rows, 1>;
+
+// Using shorthand for 'DepthImgRowMajor<ros, cols>' with scalar type.
+template<int rows = Dynamic, int cols = Dynamic>
+using DepthImgMatrixRowMajor =
+  Eigen::Matrix<float_t, rows, cols, Eigen::RowMajor>;
+
+// Using shorthand for 'ImgMatrix<ros, cols>' with scalar type.
+template<int rows = Dynamic, int cols = Dynamic>
+using ImgMatrix = Eigen::Matrix<uint8_t, rows, cols>;
+
+// Using shorthand for `ImgVector<ros>` with scalar type.
+template<int rows = Dynamic>
+using ImgVector = ImgMatrix<rows, 1>;
+
+// Using shorthand for 'ImgMatrixRowMajor<ros, cols>' with scalar type.
+template<int rows = Dynamic, int cols = Dynamic>
+using ImgMatrixRowMajor = Eigen::Matrix<uint8_t, rows, cols, Eigen::RowMajor>;
+
+// Using shorthand for `Matrix<rows, cols>` with scalar type.
+template<int rows = Dynamic, int cols = Dynamic>
+using Matrix = Eigen::Matrix<Scalar, rows, cols>;
+
+// Using shorthand for `Matrix<rows, cols>` with scalar type.
+template<int rows = Dynamic, int cols = Dynamic>
+using MatrixRowMajor = Eigen::Matrix<Scalar, rows, cols, Eigen::RowMajor>;
+
+// Using shorthand for `Vector<rows>` with scalar type.
+template<int rows = Dynamic>
+using Vector = Matrix<rows, 1>;
+
+// Vector bool
+template<int rows = Dynamic>
+using BoolVector = Eigen::Matrix<bool, -1, 1>;
+
+// Vector int
+template<int rows = Dynamic>
+using IntVector = Eigen::Matrix<int, -1, 1>;
+
+// Using shorthand for `Array<rows, cols>` with scalar type.
+template<int rows = Dynamic, int cols = rows>
+using Array = Eigen::Array<Scalar, rows, cols>;
+
+// Using `SparseMatrix` with type.
+using SparseMatrix = Eigen::SparseMatrix<Scalar>;
+
+// Using SparseTriplet with type.
+using SparseTriplet = Eigen::Triplet<Scalar>;
+
+// Using `Quaternion` with type.
+using Quaternion = Eigen::Quaternion<Scalar>;
+
+// Using 'AngleAxis' with type
+using AngleAxis = Eigen::AngleAxis<Scalar>;
+
+// Using `Ref` for modifier references.
+template<class Derived>
+using Ref = Eigen::Ref<Derived>;
+
+// // Using `ConstRef` for constant references.
+template<class Derived>
+using ConstRef = const Eigen::Ref<const Derived>;
+
+// Using `Map`.
+template<class Derived>
+using Map = Eigen::Map<Derived>;
+
+static constexpr Scalar Gz = -9.81;
+const Vector<3> GVEC{0.0, 0.0, Gz};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/controller/PositionCommand.h
+++ b/flightlib/include/flightlib/controller/PositionCommand.h
@@ -0,0 +1,400 @@
+// Generated by gencpp from file quadrotor_msgs/PositionCommand.msg
+// DO NOT EDIT!
+
+
+#ifndef QUADROTOR_MSGS_MESSAGE_POSITIONCOMMAND_H
+#define QUADROTOR_MSGS_MESSAGE_POSITIONCOMMAND_H
+
+
+#include <string>
+#include <vector>
+#include <map>
+
+#include <ros/types.h>
+#include <ros/serialization.h>
+#include <ros/builtin_message_traits.h>
+#include <ros/message_operations.h>
+
+#include <std_msgs/Header.h>
+#include <geometry_msgs/Point.h>
+#include <geometry_msgs/Vector3.h>
+#include <geometry_msgs/Vector3.h>
+
+namespace quadrotor_msgs
+{
+template <class ContainerAllocator>
+struct PositionCommand_
+{
+  typedef PositionCommand_<ContainerAllocator> Type;
+
+  PositionCommand_()
+    : header()
+    , position()
+    , velocity()
+    , acceleration()
+    , yaw(0.0)
+    , yaw_dot(0.0)
+    , kx()
+    , kv()
+    , trajectory_id(0)
+    , trajectory_flag(0)  {
+      kx.assign(0.0);
+
+      kv.assign(0.0);
+  }
+  PositionCommand_(const ContainerAllocator& _alloc)
+    : header(_alloc)
+    , position(_alloc)
+    , velocity(_alloc)
+    , acceleration(_alloc)
+    , yaw(0.0)
+    , yaw_dot(0.0)
+    , kx()
+    , kv()
+    , trajectory_id(0)
+    , trajectory_flag(0)  {
+  (void)_alloc;
+      kx.assign(0.0);
+
+      kv.assign(0.0);
+  }
+
+
+
+   typedef  ::std_msgs::Header_<ContainerAllocator>  _header_type;
+  _header_type header;
+
+   typedef  ::geometry_msgs::Point_<ContainerAllocator>  _position_type;
+  _position_type position;
+
+   typedef  ::geometry_msgs::Vector3_<ContainerAllocator>  _velocity_type;
+  _velocity_type velocity;
+
+   typedef  ::geometry_msgs::Vector3_<ContainerAllocator>  _acceleration_type;
+  _acceleration_type acceleration;
+
+   typedef double _yaw_type;
+  _yaw_type yaw;
+
+   typedef double _yaw_dot_type;
+  _yaw_dot_type yaw_dot;
+
+   typedef boost::array<double, 3>  _kx_type;
+  _kx_type kx;
+
+   typedef boost::array<double, 3>  _kv_type;
+  _kv_type kv;
+
+   typedef uint32_t _trajectory_id_type;
+  _trajectory_id_type trajectory_id;
+
+   typedef uint8_t _trajectory_flag_type;
+  _trajectory_flag_type trajectory_flag;
+
+
+
+// reducing the odds to have name collisions with Windows.h 
+#if defined(_WIN32) && defined(TRAJECTORY_STATUS_EMPTY)
+  #undef TRAJECTORY_STATUS_EMPTY
+#endif
+#if defined(_WIN32) && defined(TRAJECTORY_STATUS_READY)
+  #undef TRAJECTORY_STATUS_READY
+#endif
+#if defined(_WIN32) && defined(TRAJECTORY_STATUS_COMPLETED)
+  #undef TRAJECTORY_STATUS_COMPLETED
+#endif
+#if defined(_WIN32) && defined(TRAJECTROY_STATUS_ABORT)
+  #undef TRAJECTROY_STATUS_ABORT
+#endif
+#if defined(_WIN32) && defined(TRAJECTORY_STATUS_ILLEGAL_START)
+  #undef TRAJECTORY_STATUS_ILLEGAL_START
+#endif
+#if defined(_WIN32) && defined(TRAJECTORY_STATUS_ILLEGAL_FINAL)
+  #undef TRAJECTORY_STATUS_ILLEGAL_FINAL
+#endif
+#if defined(_WIN32) && defined(TRAJECTORY_STATUS_IMPOSSIBLE)
+  #undef TRAJECTORY_STATUS_IMPOSSIBLE
+#endif
+
+  enum {
+    TRAJECTORY_STATUS_EMPTY = 0u,
+    TRAJECTORY_STATUS_READY = 1u,
+    TRAJECTORY_STATUS_COMPLETED = 3u,
+    TRAJECTROY_STATUS_ABORT = 4u,
+    TRAJECTORY_STATUS_ILLEGAL_START = 5u,
+    TRAJECTORY_STATUS_ILLEGAL_FINAL = 6u,
+    TRAJECTORY_STATUS_IMPOSSIBLE = 7u,
+  };
+
+
+  typedef boost::shared_ptr< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> > Ptr;
+  typedef boost::shared_ptr< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> const> ConstPtr;
+
+}; // struct PositionCommand_
+
+typedef ::quadrotor_msgs::PositionCommand_<std::allocator<void> > PositionCommand;
+
+typedef boost::shared_ptr< ::quadrotor_msgs::PositionCommand > PositionCommandPtr;
+typedef boost::shared_ptr< ::quadrotor_msgs::PositionCommand const> PositionCommandConstPtr;
+
+// constants requiring out of line definition
+
+   
+
+   
+
+   
+
+   
+
+   
+
+   
+
+   
+
+
+
+template<typename ContainerAllocator>
+std::ostream& operator<<(std::ostream& s, const ::quadrotor_msgs::PositionCommand_<ContainerAllocator> & v)
+{
+ros::message_operations::Printer< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> >::stream(s, "", v);
+return s;
+}
+
+
+template<typename ContainerAllocator1, typename ContainerAllocator2>
+bool operator==(const ::quadrotor_msgs::PositionCommand_<ContainerAllocator1> & lhs, const ::quadrotor_msgs::PositionCommand_<ContainerAllocator2> & rhs)
+{
+  return lhs.header == rhs.header &&
+    lhs.position == rhs.position &&
+    lhs.velocity == rhs.velocity &&
+    lhs.acceleration == rhs.acceleration &&
+    lhs.yaw == rhs.yaw &&
+    lhs.yaw_dot == rhs.yaw_dot &&
+    lhs.kx == rhs.kx &&
+    lhs.kv == rhs.kv &&
+    lhs.trajectory_id == rhs.trajectory_id &&
+    lhs.trajectory_flag == rhs.trajectory_flag;
+}
+
+template<typename ContainerAllocator1, typename ContainerAllocator2>
+bool operator!=(const ::quadrotor_msgs::PositionCommand_<ContainerAllocator1> & lhs, const ::quadrotor_msgs::PositionCommand_<ContainerAllocator2> & rhs)
+{
+  return !(lhs == rhs);
+}
+
+
+} // namespace quadrotor_msgs
+
+namespace ros
+{
+namespace message_traits
+{
+
+
+
+
+
+template <class ContainerAllocator>
+struct IsMessage< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> >
+  : TrueType
+  { };
+
+template <class ContainerAllocator>
+struct IsMessage< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> const>
+  : TrueType
+  { };
+
+template <class ContainerAllocator>
+struct IsFixedSize< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> >
+  : FalseType
+  { };
+
+template <class ContainerAllocator>
+struct IsFixedSize< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> const>
+  : FalseType
+  { };
+
+template <class ContainerAllocator>
+struct HasHeader< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> >
+  : TrueType
+  { };
+
+template <class ContainerAllocator>
+struct HasHeader< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> const>
+  : TrueType
+  { };
+
+
+template<class ContainerAllocator>
+struct MD5Sum< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> >
+{
+  static const char* value()
+  {
+    return "4712f0609ca29a79af79a35ca3e3967a";
+  }
+
+  static const char* value(const ::quadrotor_msgs::PositionCommand_<ContainerAllocator>&) { return value(); }
+  static const uint64_t static_value1 = 0x4712f0609ca29a79ULL;
+  static const uint64_t static_value2 = 0xaf79a35ca3e3967aULL;
+};
+
+template<class ContainerAllocator>
+struct DataType< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> >
+{
+  static const char* value()
+  {
+    return "quadrotor_msgs/PositionCommand";
+  }
+
+  static const char* value(const ::quadrotor_msgs::PositionCommand_<ContainerAllocator>&) { return value(); }
+};
+
+template<class ContainerAllocator>
+struct Definition< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> >
+{
+  static const char* value()
+  {
+    return "Header header\n"
+"geometry_msgs/Point position\n"
+"geometry_msgs/Vector3 velocity\n"
+"geometry_msgs/Vector3 acceleration\n"
+"float64 yaw\n"
+"float64 yaw_dot\n"
+"float64[3] kx\n"
+"float64[3] kv \n"
+"\n"
+"uint32 trajectory_id\n"
+"\n"
+"uint8 TRAJECTORY_STATUS_EMPTY = 0\n"
+"uint8 TRAJECTORY_STATUS_READY = 1\n"
+"uint8 TRAJECTORY_STATUS_COMPLETED = 3\n"
+"uint8 TRAJECTROY_STATUS_ABORT = 4\n"
+"uint8 TRAJECTORY_STATUS_ILLEGAL_START = 5\n"
+"uint8 TRAJECTORY_STATUS_ILLEGAL_FINAL = 6\n"
+"uint8 TRAJECTORY_STATUS_IMPOSSIBLE = 7\n"
+"\n"
+"# Its ID number will start from 1, allowing you comparing it with 0.\n"
+"uint8 trajectory_flag\n"
+"\n"
+"================================================================================\n"
+"MSG: std_msgs/Header\n"
+"# Standard metadata for higher-level stamped data types.\n"
+"# This is generally used to communicate timestamped data \n"
+"# in a particular coordinate frame.\n"
+"# \n"
+"# sequence ID: consecutively increasing ID \n"
+"uint32 seq\n"
+"#Two-integer timestamp that is expressed as:\n"
+"# * stamp.sec: seconds (stamp_secs) since epoch (in Python the variable is called 'secs')\n"
+"# * stamp.nsec: nanoseconds since stamp_secs (in Python the variable is called 'nsecs')\n"
+"# time-handling sugar is provided by the client library\n"
+"time stamp\n"
+"#Frame this data is associated with\n"
+"string frame_id\n"
+"\n"
+"================================================================================\n"
+"MSG: geometry_msgs/Point\n"
+"# This contains the position of a point in free space\n"
+"float64 x\n"
+"float64 y\n"
+"float64 z\n"
+"\n"
+"================================================================================\n"
+"MSG: geometry_msgs/Vector3\n"
+"# This represents a vector in free space. \n"
+"# It is only meant to represent a direction. Therefore, it does not\n"
+"# make sense to apply a translation to it (e.g., when applying a \n"
+"# generic rigid transformation to a Vector3, tf2 will only apply the\n"
+"# rotation). If you want your data to be translatable too, use the\n"
+"# geometry_msgs/Point message instead.\n"
+"\n"
+"float64 x\n"
+"float64 y\n"
+"float64 z\n"
+;
+  }
+
+  static const char* value(const ::quadrotor_msgs::PositionCommand_<ContainerAllocator>&) { return value(); }
+};
+
+} // namespace message_traits
+} // namespace ros
+
+namespace ros
+{
+namespace serialization
+{
+
+  template<class ContainerAllocator> struct Serializer< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> >
+  {
+    template<typename Stream, typename T> inline static void allInOne(Stream& stream, T m)
+    {
+      stream.next(m.header);
+      stream.next(m.position);
+      stream.next(m.velocity);
+      stream.next(m.acceleration);
+      stream.next(m.yaw);
+      stream.next(m.yaw_dot);
+      stream.next(m.kx);
+      stream.next(m.kv);
+      stream.next(m.trajectory_id);
+      stream.next(m.trajectory_flag);
+    }
+
+    ROS_DECLARE_ALLINONE_SERIALIZER
+  }; // struct PositionCommand_
+
+} // namespace serialization
+} // namespace ros
+
+namespace ros
+{
+namespace message_operations
+{
+
+template<class ContainerAllocator>
+struct Printer< ::quadrotor_msgs::PositionCommand_<ContainerAllocator> >
+{
+  template<typename Stream> static void stream(Stream& s, const std::string& indent, const ::quadrotor_msgs::PositionCommand_<ContainerAllocator>& v)
+  {
+    s << indent << "header: ";
+    s << std::endl;
+    Printer< ::std_msgs::Header_<ContainerAllocator> >::stream(s, indent + "  ", v.header);
+    s << indent << "position: ";
+    s << std::endl;
+    Printer< ::geometry_msgs::Point_<ContainerAllocator> >::stream(s, indent + "  ", v.position);
+    s << indent << "velocity: ";
+    s << std::endl;
+    Printer< ::geometry_msgs::Vector3_<ContainerAllocator> >::stream(s, indent + "  ", v.velocity);
+    s << indent << "acceleration: ";
+    s << std::endl;
+    Printer< ::geometry_msgs::Vector3_<ContainerAllocator> >::stream(s, indent + "  ", v.acceleration);
+    s << indent << "yaw: ";
+    Printer<double>::stream(s, indent + "  ", v.yaw);
+    s << indent << "yaw_dot: ";
+    Printer<double>::stream(s, indent + "  ", v.yaw_dot);
+    s << indent << "kx[]" << std::endl;
+    for (size_t i = 0; i < v.kx.size(); ++i)
+    {
+      s << indent << "  kx[" << i << "]: ";
+      Printer<double>::stream(s, indent + "  ", v.kx[i]);
+    }
+    s << indent << "kv[]" << std::endl;
+    for (size_t i = 0; i < v.kv.size(); ++i)
+    {
+      s << indent << "  kv[" << i << "]: ";
+      Printer<double>::stream(s, indent + "  ", v.kv[i]);
+    }
+    s << indent << "trajectory_id: ";
+    Printer<uint32_t>::stream(s, indent + "  ", v.trajectory_id);
+    s << indent << "trajectory_flag: ";
+    Printer<uint8_t>::stream(s, indent + "  ", v.trajectory_flag);
+  }
+};
+
+} // namespace message_operations
+} // namespace ros
+
+#endif // QUADROTOR_MSGS_MESSAGE_POSITIONCOMMAND_H
--- a/flightlib/include/flightlib/controller/ctrl_ref.h
+++ b/flightlib/include/flightlib/controller/ctrl_ref.h
@@ -0,0 +1,290 @@
+// Generated by gencpp from file quad_pos_ctrl/ctrl_ref.msg
+// DO NOT EDIT!
+
+
+#ifndef QUAD_POS_CTRL_MESSAGE_CTRL_REF_H
+#define QUAD_POS_CTRL_MESSAGE_CTRL_REF_H
+
+
+#include <string>
+#include <vector>
+#include <map>
+
+#include <ros/types.h>
+#include <ros/serialization.h>
+#include <ros/builtin_message_traits.h>
+#include <ros/message_operations.h>
+
+#include <std_msgs/Header.h>
+
+namespace quad_pos_ctrl
+{
+template <class ContainerAllocator>
+struct ctrl_ref_
+{
+  typedef ctrl_ref_<ContainerAllocator> Type;
+
+  ctrl_ref_()
+    : header()
+    , pos_ref()
+    , vel_ref()
+    , acc_ref()
+    , yaw_ref(0.0)
+    , ref_mask(0)  {
+      pos_ref.assign(0.0);
+
+      vel_ref.assign(0.0);
+
+      acc_ref.assign(0.0);
+  }
+  ctrl_ref_(const ContainerAllocator& _alloc)
+    : header(_alloc)
+    , pos_ref()
+    , vel_ref()
+    , acc_ref()
+    , yaw_ref(0.0)
+    , ref_mask(0)  {
+  (void)_alloc;
+      pos_ref.assign(0.0);
+
+      vel_ref.assign(0.0);
+
+      acc_ref.assign(0.0);
+  }
+
+
+
+   typedef  ::std_msgs::Header_<ContainerAllocator>  _header_type;
+  _header_type header;
+
+   typedef boost::array<float, 3>  _pos_ref_type;
+  _pos_ref_type pos_ref;
+
+   typedef boost::array<float, 3>  _vel_ref_type;
+  _vel_ref_type vel_ref;
+
+   typedef boost::array<float, 3>  _acc_ref_type;
+  _acc_ref_type acc_ref;
+
+   typedef float _yaw_ref_type;
+  _yaw_ref_type yaw_ref;
+
+   typedef uint8_t _ref_mask_type;
+  _ref_mask_type ref_mask;
+
+
+
+  enum {
+    POS_CTRL_VALIED = 1u,
+    VEL_CTRL_VALIED = 2u,
+    ACC_CTRL_VALIED = 4u,
+  };
+
+
+  typedef boost::shared_ptr< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> > Ptr;
+  typedef boost::shared_ptr< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> const> ConstPtr;
+
+}; // struct ctrl_ref_
+
+typedef ::quad_pos_ctrl::ctrl_ref_<std::allocator<void> > ctrl_ref;
+
+typedef boost::shared_ptr< ::quad_pos_ctrl::ctrl_ref > ctrl_refPtr;
+typedef boost::shared_ptr< ::quad_pos_ctrl::ctrl_ref const> ctrl_refConstPtr;
+
+// constants requiring out of line definition
+
+   
+
+   
+
+   
+
+
+
+template<typename ContainerAllocator>
+std::ostream& operator<<(std::ostream& s, const ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> & v)
+{
+ros::message_operations::Printer< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> >::stream(s, "", v);
+return s;
+}
+
+} // namespace quad_pos_ctrl
+
+namespace ros
+{
+namespace message_traits
+{
+
+
+
+// BOOLTRAITS {'IsFixedSize': False, 'IsMessage': True, 'HasHeader': True}
+// {'std_msgs': ['/opt/ros/kinetic/share/std_msgs/cmake/../msg'], 'geometry_msgs': ['/opt/ros/kinetic/share/geometry_msgs/cmake/../msg'], 'quad_pos_ctrl': ['/home/yiqianlingqi/Work/pos_px4lidar_ctrl_ws/src/quad_pos_ctrl/msg']}
+
+// !!!!!!!!!!! ['__class__', '__delattr__', '__dict__', '__doc__', '__eq__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_parsed_fields', 'constants', 'fields', 'full_name', 'has_header', 'header_present', 'names', 'package', 'parsed_fields', 'short_name', 'text', 'types']
+
+
+
+
+template <class ContainerAllocator>
+struct IsFixedSize< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> >
+  : FalseType
+  { };
+
+template <class ContainerAllocator>
+struct IsFixedSize< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> const>
+  : FalseType
+  { };
+
+template <class ContainerAllocator>
+struct IsMessage< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> >
+  : TrueType
+  { };
+
+template <class ContainerAllocator>
+struct IsMessage< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> const>
+  : TrueType
+  { };
+
+template <class ContainerAllocator>
+struct HasHeader< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> >
+  : TrueType
+  { };
+
+template <class ContainerAllocator>
+struct HasHeader< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> const>
+  : TrueType
+  { };
+
+
+template<class ContainerAllocator>
+struct MD5Sum< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> >
+{
+  static const char* value()
+  {
+    return "c44a0c7d669f499d943b0196aea84d57";
+  }
+
+  static const char* value(const ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator>&) { return value(); }
+  static const uint64_t static_value1 = 0xc44a0c7d669f499dULL;
+  static const uint64_t static_value2 = 0x943b0196aea84d57ULL;
+};
+
+template<class ContainerAllocator>
+struct DataType< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> >
+{
+  static const char* value()
+  {
+    return "quad_pos_ctrl/ctrl_ref";
+  }
+
+  static const char* value(const ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator>&) { return value(); }
+};
+
+template<class ContainerAllocator>
+struct Definition< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> >
+{
+  static const char* value()
+  {
+    return "# enum mask\n\
+uint8 POS_CTRL_VALIED = 1\n\
+uint8 VEL_CTRL_VALIED = 2\n\
+uint8 ACC_CTRL_VALIED = 4\n\
+\n\
+# \n\
+Header     header\n\
+float32[3] pos_ref\n\
+float32[3] vel_ref\n\
+float32[3] acc_ref\n\
+float32    yaw_ref\n\
+uint8      ref_mask\n\
+================================================================================\n\
+MSG: std_msgs/Header\n\
+# Standard metadata for higher-level stamped data types.\n\
+# This is generally used to communicate timestamped data \n\
+# in a particular coordinate frame.\n\
+# \n\
+# sequence ID: consecutively increasing ID \n\
+uint32 seq\n\
+#Two-integer timestamp that is expressed as:\n\
+# * stamp.sec: seconds (stamp_secs) since epoch (in Python the variable is called 'secs')\n\
+# * stamp.nsec: nanoseconds since stamp_secs (in Python the variable is called 'nsecs')\n\
+# time-handling sugar is provided by the client library\n\
+time stamp\n\
+#Frame this data is associated with\n\
+# 0: no frame\n\
+# 1: global frame\n\
+string frame_id\n\
+";
+  }
+
+  static const char* value(const ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator>&) { return value(); }
+};
+
+} // namespace message_traits
+} // namespace ros
+
+namespace ros
+{
+namespace serialization
+{
+
+  template<class ContainerAllocator> struct Serializer< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> >
+  {
+    template<typename Stream, typename T> inline static void allInOne(Stream& stream, T m)
+    {
+      stream.next(m.header);
+      stream.next(m.pos_ref);
+      stream.next(m.vel_ref);
+      stream.next(m.acc_ref);
+      stream.next(m.yaw_ref);
+      stream.next(m.ref_mask);
+    }
+
+    ROS_DECLARE_ALLINONE_SERIALIZER
+  }; // struct ctrl_ref_
+
+} // namespace serialization
+} // namespace ros
+
+namespace ros
+{
+namespace message_operations
+{
+
+template<class ContainerAllocator>
+struct Printer< ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator> >
+{
+  template<typename Stream> static void stream(Stream& s, const std::string& indent, const ::quad_pos_ctrl::ctrl_ref_<ContainerAllocator>& v)
+  {
+    s << indent << "header: ";
+    s << std::endl;
+    Printer< ::std_msgs::Header_<ContainerAllocator> >::stream(s, indent + "  ", v.header);
+    s << indent << "pos_ref[]" << std::endl;
+    for (size_t i = 0; i < v.pos_ref.size(); ++i)
+    {
+      s << indent << "  pos_ref[" << i << "]: ";
+      Printer<float>::stream(s, indent + "  ", v.pos_ref[i]);
+    }
+    s << indent << "vel_ref[]" << std::endl;
+    for (size_t i = 0; i < v.vel_ref.size(); ++i)
+    {
+      s << indent << "  vel_ref[" << i << "]: ";
+      Printer<float>::stream(s, indent + "  ", v.vel_ref[i]);
+    }
+    s << indent << "acc_ref[]" << std::endl;
+    for (size_t i = 0; i < v.acc_ref.size(); ++i)
+    {
+      s << indent << "  acc_ref[" << i << "]: ";
+      Printer<float>::stream(s, indent + "  ", v.acc_ref[i]);
+    }
+    s << indent << "yaw_ref: ";
+    Printer<float>::stream(s, indent + "  ", v.yaw_ref);
+    s << indent << "ref_mask: ";
+    Printer<uint8_t>::stream(s, indent + "  ", v.ref_mask);
+  }
+};
+
+} // namespace message_operations
+} // namespace ros
+
+#endif // QUAD_POS_CTRL_MESSAGE_CTRL_REF_H
--- a/flightlib/include/flightlib/dynamics/dynamics_base.hpp
+++ b/flightlib/include/flightlib/dynamics/dynamics_base.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "flightlib/common/types.hpp"
+
+namespace flightlib {
+
+class DynamicsBase {
+	EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+   public:
+	using DynamicsFunction = std::function<bool(const Ref<const Vector<>>, Ref<Vector<>>)>;
+
+	DynamicsBase();
+	virtual ~DynamicsBase();
+
+	// public get function
+	virtual DynamicsFunction getDynamicsFunction() const = 0;
+
+   private:
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/dynamics/quadrotor_dynamics.hpp
+++ b/flightlib/include/flightlib/dynamics/quadrotor_dynamics.hpp
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <yaml-cpp/yaml.h>
+
+#include <iostream>
+#include <memory>
+
+#include "flightlib/common/logger.hpp"
+#include "flightlib/common/math.hpp"
+#include "flightlib/common/quad_state.hpp"
+#include "flightlib/dynamics/dynamics_base.hpp"
+
+namespace flightlib {
+
+class QuadrotorDynamics : DynamicsBase {
+   public:
+	EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+	QuadrotorDynamics(const Scalar mass = 1.0, const Scalar arm_l = 0.2);
+	~QuadrotorDynamics();
+
+	// dynamics function
+	bool dState(const QuadState& state, QuadState* derivative) const;
+	bool dState(const Ref<const Vector<QuadState::SIZE>> state, Ref<Vector<QuadState::SIZE>> derivative) const;
+
+	// public get function
+	DynamicsFunction getDynamicsFunction() const;
+
+	// help functions
+	bool valid() const;
+	bool updateParams(const YAML::Node& params);
+
+	// Helpers to apply limits.
+	Vector<4> clampThrust(const Vector<4> thrusts) const;
+	Scalar clampThrust(const Scalar thrust) const;
+	Scalar clampCollectiveThrust(const Scalar thrust) const;
+
+	Vector<4> clampMotorOmega(const Vector<4>& omega) const;
+	Vector<3> clampBodyrates(const Vector<3>& omega) const;
+
+	inline Scalar collective_thrust_min() const { return 4.0 * thrust_min_; }
+	inline Scalar collective_thrust_max() const { return 4.0 * thrust_max_; }
+
+	// Helpers for conversion
+	Vector<4> motorOmegaToThrust(const Vector<4>& omega) const;
+	Vector<4> motorThrustToOmega(const Vector<4>& thrusts) const;
+	Matrix<4, 4> getAllocationMatrix() const;
+
+	//
+	inline Scalar getMass(void) const { return mass_; };
+	inline Scalar getArmLength(void) const { return arm_l_; };
+	inline Scalar getMotorTauInv() const { return motor_tau_inv_; };
+	inline Matrix<3, 3> getJ(void) const { return J_; };
+	inline Matrix<3, 3> getJInv(void) const { return J_inv_; };
+	inline Vector<3> getOmegaMax(void) const { return omega_max_; };
+
+	bool setMass(const Scalar mass);
+	bool setArmLength(const Scalar arm_length);
+	bool setMotortauInv(const Scalar tau_inv);
+
+	friend std::ostream& operator<<(std::ostream& os, const QuadrotorDynamics& quad_dymaics);
+
+   private:
+	bool updateInertiaMarix();
+	Scalar mass_;
+	Scalar arm_l_;
+	Matrix<3, 4> t_BM_;
+	Matrix<3, 3> J_;
+	Matrix<3, 3> J_inv_;
+
+	// motors
+	Scalar motor_omega_min_;
+	Scalar motor_omega_max_;
+	Scalar motor_tau_inv_;
+
+	// Propellers
+	Vector<3> thrust_map_;
+	Scalar kappa_;
+	Scalar thrust_min_;
+	Scalar thrust_max_;
+	Scalar collective_thrust_min_;
+	Scalar collective_thrust_max_;
+
+	// Quadrotor limits
+	Vector<3> omega_max_;
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/envs/env_base.hpp
+++ b/flightlib/include/flightlib/envs/env_base.hpp
@@ -0,0 +1,70 @@
+//
+// This is inspired by RaiGym, thanks.
+//
+#pragma once
+
+#include <unistd.h>
+#include <yaml-cpp/yaml.h>
+
+#include <memory>
+#include <random>
+#include <unordered_map>
+#include <vector>
+
+#include "flightlib/common/types.hpp"
+
+namespace flightlib {
+
+class EnvBase {
+   public:
+	EnvBase() : obs_dim_(0), act_dim_(0), rew_dim_(0), sim_dt_(0.0), img_width_(0), img_height_(0) {};
+	virtual ~EnvBase() {};
+
+	// (pure virtual) public methods (has to be implemented by child classes)
+	virtual bool reset(Ref<Vector<>> obs, const bool random = true)                     = 0;
+	virtual bool step(const Ref<Vector<>> act, Ref<Vector<>> obs, Ref<Vector<>> reward) = 0;
+	virtual bool getObs(Ref<Vector<>> obs)                                              = 0;
+
+	// (virtual) public methods (implementations are optional.)
+	virtual bool getDepthImage(Ref<DepthImgVector<>> img) { return false; }
+	virtual void close() {};
+	// virtual void render();
+	virtual void updateExtraInfo() {};
+	virtual bool isTerminalState(Scalar &reward) {
+		reward = 0.f;
+		return false;
+	}
+
+	// auxilirary functions
+	inline void setSeed(const int seed) { std::srand(seed); };
+	inline int getObsDim() { return obs_dim_; };
+	inline int getActDim() { return act_dim_; };
+	inline int getRewDim() { return rew_dim_; };
+	inline int getImgWidth() { return img_width_; };
+	inline int getImgHeight() { return img_height_; };
+	inline Scalar getSimTimeStep() { return sim_dt_; };
+	inline int getExtraInfoDim() { return extra_info_.size(); };
+
+	// public variables
+	std::unordered_map<std::string, float> extra_info_;
+
+   protected:
+	// observation and action dimenstions (for Reinforcement learning)
+	int obs_dim_;
+	int act_dim_;
+	int rew_dim_;
+
+	int img_width_;
+	int img_height_;
+
+	// control time step
+	Scalar sim_dt_{0.02};
+
+	// random variable generator
+	std::normal_distribution<Scalar> norm_dist_{0.0, 1.0};
+	std::uniform_real_distribution<Scalar> uniform_dist_{-1.0, 1.0};
+	std::random_device rd_;
+	std::mt19937 random_gen_{rd_()};
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/envs/quadrotor_env.hpp
+++ b/flightlib/include/flightlib/envs/quadrotor_env.hpp
@@ -0,0 +1,119 @@
+#pragma once
+
+// std lib
+#include <pcl/point_types.h>
+#include <pcl/search/kdtree.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include <yaml-cpp/yaml.h>
+
+#include <cmath>
+#include <iostream>
+// flightlib
+#include "flightlib/bridges/unity_bridge.hpp"
+#include "flightlib/common/command.hpp"
+#include "flightlib/common/logger.hpp"
+#include "flightlib/common/math.hpp"
+#include "flightlib/common/quad_state.hpp"
+#include "flightlib/common/types.hpp"
+#include "flightlib/envs/env_base.hpp"
+#include "flightlib/grad_traj_optimization/traj_optimization_bridge.h"
+#include "flightlib/objects/quadrotor.hpp"
+#include "flightlib/sensors/sgm_gpu/sgm_gpu.h"
+
+namespace flightlib {
+
+enum Ctl : int {
+	kNObs = 13,  // observation dim
+	kNAct = 9,   // action dim
+};
+
+class QuadrotorEnv final : public EnvBase {
+   public:
+	EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+
+	QuadrotorEnv();
+	QuadrotorEnv(const std::string &cfg_path);
+	~QuadrotorEnv();
+
+	/* set functions */
+	bool reset(Ref<Vector<>> obs, const bool random = true) override;
+	void setState(ConstRef<Vector<>> state);
+	void setGoal(ConstRef<Vector<>> goal);
+	void setMapID(int id);
+	void setDAggerMode(bool dagger_mode) { dagger_mode_ = dagger_mode; }
+	bool step(const Ref<Vector<>> act, Ref<Vector<>> obs, Ref<Vector<>> reward) override;
+	void setInputCloud(const pcl::PointCloud<pcl::PointXYZ> &point_in);
+	void setESDF(const std::vector<float> &esdf_map, const pcl::PointCloud<pcl::PointXYZ> &point_in);
+	void addKdtree(std::shared_ptr<pcl::search::KdTree<pcl::PointXYZ>> kdtree) { kdtrees.push_back(kdtree); }
+	void addESDFMap(std::shared_ptr<sdf_tools::SignedDistanceField> esdf_map) { esdf_maps.push_back(esdf_map); }
+	void addMapSize(Eigen::Vector3d map_boundary_min, Eigen::Vector3d map_boundary_max) {
+		min_map_boundaries.push_back(map_boundary_min);
+		max_map_boundaries.push_back(map_boundary_max);
+	}
+
+	/* get functions */
+	bool getObs(Ref<Vector<>> obs) override;
+	bool getAct(Ref<Vector<>> act) const;
+	bool getRGBImage(Ref<ImgVector<>> img, const bool rgb);
+	bool getDepthImage(Ref<DepthImgVector<>> img) override;
+	bool getStereoImage(Ref<DepthImgVector<>> img);
+	int getMapNum() { return kdtrees.size(); }
+	void getCostAndGradient(ConstRef<Vector<>> dp, int id, float &cost, Ref<Vector<>> grad);
+	inline std::vector<std::string> getRewardNames() { return reward_names_; }
+	void getWorldBox(Ref<Vector<>> world_box) {
+		world_box << world_box_(0), world_box_(1), world_box_(2), world_box_(3), world_box_(4), world_box_(5);  // xyz_min, xyz_max
+	};
+
+	/* other functions */
+	void collisionCheck(float dis = 0.2);
+	bool configCamera(const YAML::Node &cfg_node);
+	bool loadParam(const YAML::Node &cfg);
+	void computeDepthImage(const cv::Mat &left_frame, const cv::Mat &right_frame, cv::Mat *const depth);
+	bool isTerminalState(Scalar &reward) override;
+	void addObjectsToUnity(std::shared_ptr<UnityBridge> bridge) { bridge->addQuadrotor(quadrotor_ptr_); }
+	void runControlAndUpdateState(Eigen::Vector3f p_ref, Eigen::Vector3f v_ref, Eigen::Vector3f a_ref);
+
+   private:
+	// quadrotor state and observation
+	std::shared_ptr<Quadrotor> quadrotor_ptr_;
+	QuadState quad_state_;
+	Vector<kNObs> quad_obs_;
+	Vector<kNAct> quad_act_;
+	Logger logger_{"QaudrotorEnv"};
+	Eigen::Vector3f desired_p_, desired_v_, desired_a_;
+
+	// map
+	Matrix<3, 2> world_box_;
+	Vector<3> center_, scale_;
+	std::vector<std::shared_ptr<pcl::search::KdTree<pcl::PointXYZ>>> kdtrees;
+	std::vector<std::shared_ptr<sdf_tools::SignedDistanceField>> esdf_maps;
+	std::vector<Eigen::Vector3d> min_map_boundaries, max_map_boundaries;
+
+	// camera params
+	Scalar fov_;
+	int width_, height_;
+	Scalar stereo_baseline_;
+	std::shared_ptr<sgm_gpu::SgmGpu> sgm_;
+	cv::Mat rgb_img_, gray_img_, depth_img_;
+	std::shared_ptr<RGBCamera> rgb_camera_left, rgb_camera_right;
+
+	// trajectory optimization
+	int map_idx_{0};
+	Vector<3> goal_;
+	TrajOptimizationBridge *traj_opt_bridge;
+
+	// data collection
+	Scalar roll_var_, pitch_var_;
+
+	// others
+	int steps_;
+	YAML::Node cfg_;
+	bool is_collision_;
+	Scalar nearest_obstacle{10};
+	std::vector<std::string> reward_names_;
+	bool collect_data_, dagger_mode_{false};
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/envs/vec_env.hpp
+++ b/flightlib/include/flightlib/envs/vec_env.hpp
@@ -0,0 +1,106 @@
+//
+// This is inspired by RaiGym, thanks.
+//
+#pragma once
+
+// std
+#include <omp.h>
+#include <time.h>
+
+#include <filesystem>
+#include <memory>
+#include <regex>
+
+// pcl
+#include <pcl/io/ply_io.h>
+#include <pcl/point_types.h>
+#include <pcl/search/kdtree.h>
+
+// flightlib
+#include "flightlib/bridges/unity_bridge.hpp"
+#include "flightlib/common/logger.hpp"
+#include "flightlib/common/types.hpp"
+#include "flightlib/envs/env_base.hpp"
+#include "flightlib/envs/quadrotor_env.hpp"
+#include "flightlib/grad_traj_optimization/traj_optimization_bridge.h"
+
+namespace flightlib {
+
+template<typename EnvBase>
+class VecEnv {
+   public:
+	VecEnv();
+	VecEnv(const std::string& cfgs, const bool from_file = true);
+	VecEnv(const YAML::Node& cfgs_node);
+	~VecEnv();
+
+	/* unity functions */
+	bool connectUnity();
+	void disconnectUnity();
+	bool setUnity(bool render);
+	void render();
+	bool spawnTrees();
+	bool savePointcloud(int ply_id);
+	bool spawnTreesAndSavePointcloud(int ply_id_in = -1, float spacing = -1);
+	void close();
+	void setSeed(const int seed);
+
+	/* set functions */
+	bool reset(Ref<MatrixRowMajor<>> obs);
+	bool step(Ref<MatrixRowMajor<>> act, Ref<MatrixRowMajor<>> obs, Ref<MatrixRowMajor<>> reward, Ref<BoolVector<>> done);
+	void perAgentStep(int agent_id, Ref<MatrixRowMajor<>> act, Ref<MatrixRowMajor<>> obs, Ref<MatrixRowMajor<>> reward, Ref<BoolVector<>> done);
+	bool setState(ConstRef<MatrixRowMajor<>> state);  // World Frame
+	bool setGoal(ConstRef<MatrixRowMajor<>> goal);    // World Frame
+	void setMapID(ConstRef<IntVector<>> id);
+
+	/* get functions */
+	inline int getObsDim(void) { return obs_dim_; };
+	inline int getActDim(void) { return act_dim_; };
+	inline int getRewDim(void) const { return rew_dim_; };
+	inline int getImgHeight(void) const { return img_height_; };
+	inline int getImgWidth(void) const { return img_width_; };
+	inline int getNumOfEnvs(void) { return envs_.size(); };
+	inline std::vector<std::string> getRewardNames(void) { return envs_[0]->getRewardNames(); };
+	inline void getWorldBox(Ref<Vector<>> world_box) { envs_[0]->getWorldBox(world_box); };
+	void getObs(Ref<MatrixRowMajor<>> obs);
+	bool getRGBImage(Ref<ImgMatrixRowMajor<>> img, const bool rgb_image);
+	bool getStereoImage(Ref<DepthImgMatrixRowMajor<>> img);
+	bool getDepthImage(Ref<DepthImgMatrixRowMajor<>> img);
+	void getCostAndGradient(ConstRef<MatrixRowMajor<>> dp, ConstRef<IntVector<>> traj_id, Ref<Vector<>> cost,
+	                        Ref<MatrixRowMajor<>> grad);  // Body Frame
+
+	/* other functions */
+	void init(void);
+	void generateMaps();
+	int extract_number(const std::string& filename);
+
+   private:
+	// create objects
+	Logger logger_{"VecEnv"};
+	std::vector<std::unique_ptr<EnvBase>> envs_;
+
+	// Flightmare(Unity3D)
+	std::shared_ptr<UnityBridge> unity_bridge_ptr_;
+	SceneID scene_id_{UnityScene::WAREHOUSE};
+	bool unity_ready_{false};
+	bool unity_render_{false};
+	FrameID frameID{1};
+	RenderMessage_t unity_output_;
+	uint16_t receive_id_{0};
+
+	// scenario generation
+	Scalar avg_tree_spacing_;
+	Vector<3> bounding_box_, bounding_box_origin_;
+	Scalar pointcloud_resolution_;
+
+	// other variables
+	std::string ply_path_;
+	bool dagger_mode_{false}, supervised_mode_{false};
+	int seed_, num_envs_, obs_dim_, act_dim_, rew_dim_, num_threads_;
+	int img_width_, img_height_;
+
+	// yaml configurations
+	YAML::Node cfg_;
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/grad_traj_optimization/grad_traj_optimizer.h
+++ b/flightlib/include/flightlib/grad_traj_optimization/grad_traj_optimizer.h
@@ -0,0 +1,115 @@
+/*
+    This code is modified from https://github.com/HKUST-Aerial-Robotics/grad_traj_optimization, thanks to their excellent work.
+*/
+
+#ifndef _GRAD_TRAJ_OPTIMIZER_H_
+#define _GRAD_TRAJ_OPTIMIZER_H_
+
+#include <yaml-cpp/yaml.h>
+
+#include <Eigen/Eigen>
+
+#include "flightlib/grad_traj_optimization/qp_generator.h"
+#include "sdf_tools/collision_map.hpp"
+#include "sdf_tools/sdf.hpp"
+
+#define GDTB getDistanceToBoundary
+
+using namespace std;
+
+class GradTrajOptimizer {
+   public:
+	GradTrajOptimizer(const YAML::Node &cfg);
+
+	void getCoefficient(Eigen::MatrixXd &coeff) { coeff = this->coeff; };
+
+	double getCost() { return this->min_f; }
+
+	void getSegmentTime(Eigen::VectorXd &T) { T = this->Time; }
+
+	void setSignedDistanceField(std::shared_ptr<sdf_tools::SignedDistanceField> s, double res);
+
+	void setGoal(Eigen::Vector3d goal);
+
+	void setBoundary(Eigen::Vector3d min, Eigen::Vector3d max);
+
+	void getCostAndGradient(const std::vector<double> &df, const std::vector<double> &dp, double &cost, std::vector<double> &grad) const;
+
+	/** convert derivatives of end points to polynomials coefficient */
+	void getCoefficientFromDerivative(Eigen::MatrixXd &coeff, const std::vector<double> &_dp) const;
+
+   private:
+	/** signed distance field */
+	double resolution;
+	Eigen::Vector3d map_boundary_min, map_boundary_max;
+	std::shared_ptr<sdf_tools::SignedDistanceField> sdf;
+	mutable Eigen::VectorXd boundary;  // min x max x... min z,max z
+
+	/** coefficient of polynomials*/
+	mutable Eigen::MatrixXd coeff;
+
+	/** important matrix and variables*/
+	Eigen::MatrixXd A;
+	Eigen::MatrixXd C;
+	Eigen::MatrixXd L;
+	Eigen::MatrixXd R;
+	Eigen::MatrixXd Rff;
+	Eigen::MatrixXd Rpp;
+	Eigen::MatrixXd Rpf;
+	Eigen::MatrixXd Rfp;
+	Eigen::VectorXd Time;
+	Eigen::MatrixXd V;
+	mutable Eigen::MatrixXd Df;
+	Eigen::MatrixXd Dp;
+	Eigen::MatrixXd path;
+	Eigen::Vector3d goal;
+
+	/** tractory params */
+	double sgm_time;
+	int num_dp;
+	int num_df;
+	int num_point;
+	double min_f;
+
+	// weight of cost
+	double w_smooth;
+	double w_collision;
+	double w_vel;
+	double w_acc;
+	double w_goal;
+	double w_long;
+
+	// params of cost
+	double d0;
+	double r;
+	double alpha;
+
+	double v0;
+	double rv;
+	double alphav;
+
+	double a0;
+	double ra;
+	double alphaa;
+
+	/** get distance and gradient in signed distance field ,by position query*/
+	void getDistanceAndGradient(Eigen::Vector3d &pos, double &dist, Eigen::Vector3d &grad) const;
+	void getPositionFromCoeff(Eigen::Vector3d &pos, const Eigen::MatrixXd &coeff, const int &index, const double &time) const;
+	void getVelocityFromCoeff(Eigen::Vector3d &vel, const Eigen::MatrixXd &coeff, const int &index, const double &time) const;
+	void getAccelerationFromCoeff(Eigen::Vector3d &acc, const Eigen::MatrixXd &coeff, const int &index, const double &time) const;
+
+	/** penalty and gradient */
+	void getDistancePenalty(const double &distance, double &cost) const;
+	void getDistancePenaltyGradient(const double &distance, double &grad) const;
+	void getVelocityPenalty(const double &distance, double &cost) const;
+	void getVelocityPenaltyGradient(const double &vel, double &grad) const;
+	void getAccelerationPenalty(const double &distance, double &cost) const;
+	void getAccelerationPenaltyGradient(const double &acc, double &grad) const;
+
+	void getTimeMatrix(const double &t, Eigen::MatrixXd &T) const;
+	void constrains(double &n, double min, double max) const;
+	double getDistanceToBoundary(const double &x, const double &y, const double &z) const;
+	void recaluculateGradient(const double &x, const double &y, const double &z, Eigen ::Vector3d &grad) const;
+};
+
+#endif
--- a/flightlib/include/flightlib/grad_traj_optimization/opt_utile.h
+++ b/flightlib/include/flightlib/grad_traj_optimization/opt_utile.h
@@ -0,0 +1,18 @@
+#ifndef _LATTICE_NODE_H_
+#define _LATTICE_NODE_H_
+#include <Eigen/Eigen>
+
+// body frame
+void getLatticeGuiding(std::vector<std::pair<Eigen::Vector3d, Eigen::Vector3d>> &lattice_nodes, int horizon_num, int vertical_num, int radio_num,
+                       int vel_num, double horizon_fov, double vertical_fov, double radio_range, double vel_fov, double vel_prefile);
+
+void getPositionFromCoeff(Eigen::Vector3d &pos, Eigen::MatrixXd coeff, int index, double time);
+
+void getVelocityFromCoeff(Eigen::Vector3d &vel, Eigen::MatrixXd coeff, int index, double time);
+
+void getAccelerationFromCoeff(Eigen::Vector3d &acc, Eigen::MatrixXd coeff, int index, double time);
+
+Eigen::MatrixXd solveCoeffFromBoundaryState(const Eigen::Vector3d &Pos_init, const Eigen::Vector3d &Vel_init, const Eigen::Vector3d &Acc_init,
+                                            const Eigen::Vector3d &Pos_end, const Eigen::Vector3d &Vel_end, const Eigen::Vector3d &Acc_end,
+                                            double Time);
+#endif
--- a/flightlib/include/flightlib/grad_traj_optimization/qp_generator.h
+++ b/flightlib/include/flightlib/grad_traj_optimization/qp_generator.h
@@ -0,0 +1,53 @@
+#ifndef _TRAJECTORY_GENERATOR_H_
+#define _TRAJECTORY_GENERATOR_H_
+#include <Eigen/Eigen>
+#include <vector>
+
+
+class TrajectoryGenerator {
+   private:
+	int m = 1;           // number of segments in the trajectory
+	Eigen::MatrixXd _A;  // Mapping matrix
+	Eigen::MatrixXd _Q;  // Hessian matrix
+	Eigen::MatrixXd _C;  // Selection matrix
+	Eigen::MatrixXd _L;  // A.inv() * C.transpose()
+
+	Eigen::MatrixXd _R;
+	Eigen::MatrixXd _Rff;
+	Eigen::MatrixXd _Rpp;
+	Eigen::MatrixXd _Rpf;
+	Eigen::MatrixXd _Rfp;
+
+	Eigen::VectorXd _Pxi;
+	Eigen::VectorXd _Pyi;
+	Eigen::VectorXd _Pzi;
+
+	Eigen::VectorXd _Dx;
+	Eigen::VectorXd _Dy;
+	Eigen::VectorXd _Dz;
+
+   public:
+	Eigen::MatrixXd _Path;
+	Eigen::VectorXd _Time;
+
+	TrajectoryGenerator();
+
+	~TrajectoryGenerator();
+
+	void QPGeneration(const Eigen::VectorXd &Time);
+
+	void StackOptiDep();  // Stack the optimization's dependency, the intermediate matrix and initial derivatives
+
+	Eigen::MatrixXd getA();
+	Eigen::MatrixXd getQ();
+	Eigen::MatrixXd getC();
+	Eigen::MatrixXd getL();
+
+	Eigen::MatrixXd getR();
+	Eigen::MatrixXd getRpp();
+	Eigen::MatrixXd getRff();
+	Eigen::MatrixXd getRfp();
+	Eigen::MatrixXd getRpf();
+};
+
+#endif
--- a/flightlib/include/flightlib/grad_traj_optimization/traj_optimization_bridge.h
+++ b/flightlib/include/flightlib/grad_traj_optimization/traj_optimization_bridge.h
@@ -0,0 +1,71 @@
+#ifndef _TRAJ_OPTIMIZATION_BRIDGE_H_
+#define _TRAJ_OPTIMIZATION_BRIDGE_H_
+
+#include <pcl/common/common.h>
+#include <pcl/io/ply_io.h>
+#include <pcl_ros/point_cloud.h>
+#include <sensor_msgs/PointCloud.h>
+#include <stdlib.h>
+#include <yaml-cpp/yaml.h>
+
+#include <Eigen/Eigen>
+
+#include "flightlib/grad_traj_optimization/grad_traj_optimizer.h"
+#include "flightlib/grad_traj_optimization/opt_utile.h"
+
+namespace traj_opt {
+std::shared_ptr<sdf_tools::SignedDistanceField> SdfConstruction(pcl::PointCloud<pcl::PointXYZ>::Ptr cloud, Eigen::Vector3d &map_boundary_min_sdf,
+                                                                Eigen::Vector3d &map_boundary_max_sdf);
+}
+
+class TrajOptimizationBridge {
+   private:
+	YAML::Node cfg_;
+	// state of uav in world frame
+	Eigen::Vector3d pos_;
+	Eigen::Vector3d vel_;
+	Eigen::Vector3d acc_;
+	Eigen::Quaterniond q_wb_;
+	Eigen::Vector3d goal_;
+	Eigen::MatrixXd pred_coeff_;
+
+	// std::string ply_file;
+	double resolution;
+	std::shared_ptr<sdf_tools::SignedDistanceField> sdf_;
+	Eigen::Vector3d map_boundary_min_, map_boundary_max_;
+
+	// x_pva, y_pva, z_pva in world frame
+	std::vector<double> dp_;  // df refers to the initial_state currently
+	std::vector<double> df_;  // dp refers to the end_state currently
+
+	double goal_length;
+	int horizon_num, vertical_num, radio_num, vel_num;
+	double horizon_fov, vertical_fov, radio_range, vel_fov, vel_prefile;
+	std::vector<std::pair<Eigen::Vector3d, Eigen::Vector3d>> lattice_nodes;
+
+	void loadParam(YAML::Node &cfg);
+
+   public:
+	TrajOptimizationBridge();
+	~TrajOptimizationBridge();
+
+	void setMap(std::shared_ptr<sdf_tools::SignedDistanceField> sdf_for_traj_optimization, Eigen::Vector3d &map_boundary_min,
+	            Eigen::Vector3d &map_boundary_max);
+
+	// in world frame
+	void setState(Eigen::Vector3d pos, Eigen::Quaterniond q, Eigen::Vector3d vel, Eigen::Vector3d acc);
+
+	void setGoal(Eigen::Vector3d goal);
+
+	// dp in body frame, grad in body frame
+	void getCostAndGradient(const std::vector<double> &dp, int id, double &cost, std::vector<double> &grad);
+
+	void getNextStateAndCost(const std::vector<double> &dp, double &cost, Eigen::Vector3d &pos, Eigen::Vector3d &vel, Eigen::Vector3d &acc,
+	                         double sim_t);
+
+	void solveBVP(const std::vector<double> &dp);
+
+	void getNextState(Eigen::Vector3d &pos, Eigen::Vector3d &vel, Eigen::Vector3d &acc, double sim_t);
+};
+
+#endif
--- a/flightlib/include/flightlib/json/json.hpp
+++ b/flightlib/include/flightlib/json/json.hpp
--- a/flightlib/include/flightlib/objects/object_base.hpp
+++ b/flightlib/include/flightlib/objects/object_base.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "flightlib/common/types.hpp"
+
+namespace flightlib {
+
+class ObjectBase {
+ public:
+  ObjectBase();
+  virtual ~ObjectBase();
+
+  // reset
+  virtual bool reset(void) = 0;
+
+  // run
+  virtual bool run(const Scalar dt) = 0;
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/objects/quadrotor.hpp
+++ b/flightlib/include/flightlib/objects/quadrotor.hpp
@@ -0,0 +1,110 @@
+#pragma once
+
+/*
+    Explanation:
+        We retain this class but do not use most of its functionalities.
+        For efficiency, we do not use the built-in dynamics model and instead
+        directly use the desired attitude given by the controller as actual state.
+        Because the proposed approach (YOPO) only focuses on the trajectory performance,
+        while control is preformed by an external controller.
+*/
+
+#include <stdlib.h>
+
+#include "flightlib/common/command.hpp"
+#include "flightlib/common/integrator_rk4.hpp"
+#include "flightlib/common/types.hpp"
+#include "flightlib/dynamics/quadrotor_dynamics.hpp"
+#include "flightlib/objects/object_base.hpp"
+#include "flightlib/sensors/imu.hpp"
+#include "flightlib/sensors/rgb_camera.hpp"
+
+namespace flightlib {
+
+class Quadrotor : ObjectBase {
+   public:
+	EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+	Quadrotor(const std::string& cfg_path);
+	Quadrotor(const QuadrotorDynamics& dynamics = QuadrotorDynamics(1.0, 0.25));
+	~Quadrotor();
+
+	// reset
+	bool reset(void) override;
+	bool reset(const QuadState& state);
+	void init(void);
+
+	// run the quadrotor
+	bool setState(const Ref<Vector<>> p, const Ref<Vector<>> v, const Quaternion q_, const Ref<Vector<>> a_, const Scalar ctl_dt);
+	bool run(const Scalar dt) override;
+	bool run(const Command& cmd, const Scalar dt);
+	void runSimpleFlight(const Eigen::Vector3f& ref_acc, float ref_yaw, Eigen::Quaternionf& quat_des);
+
+	// public get functions
+	bool getState(QuadState* const state) const;
+	bool getMotorThrusts(Ref<Vector<4>> motor_thrusts) const;
+	bool getMotorOmega(Ref<Vector<4>> motor_omega) const;
+	bool getDynamics(QuadrotorDynamics* const dynamics) const;
+
+	const QuadrotorDynamics& getDynamics();
+	Vector<3> getSize(void) const;
+	Vector<3> getPosition(void) const;
+	Quaternion getQuaternion(void) const;
+	std::vector<std::shared_ptr<RGBCamera>> getCameras(void) const;
+	bool getCamera(const size_t cam_id, std::shared_ptr<RGBCamera> camera) const;
+	int getNumCamera() const;
+	bool getCollision() const;
+
+	// public set functions
+	bool setState(const QuadState& state);
+	bool setCommand(const Command& cmd);
+	bool updateDynamics(const QuadrotorDynamics& dynamics);
+	bool addRGBCamera(std::shared_ptr<RGBCamera> camera);
+
+	// low-level controller
+	Vector<4> runFlightCtl(const Scalar sim_dt, const Vector<3>& omega, const Command& cmd);
+
+	// simulate motors
+	void runMotors(const Scalar sim_dt, const Vector<4>& motor_thrust_des);
+
+	// constrain world box
+	bool setWorldBox(const Ref<Matrix<3, 2>> box);
+	bool constrainInWorldBox(const QuadState& old_state);
+
+	//
+	inline Scalar getMass(void) { return dynamics_.getMass(); };
+	inline void setSize(const Ref<Vector<3>> size) { size_ = size; };
+	inline void setCollision(const bool collision) { collision_ = collision; };
+
+	float getSimT() { return state_.t; }
+
+   private:
+	// quadrotor dynamics, integrators
+	QuadrotorDynamics dynamics_;
+	IMU imu_;
+	std::unique_ptr<IntegratorRK4> integrator_ptr_;
+	std::vector<std::shared_ptr<RGBCamera>> rgb_cameras_;
+
+	// quad control command
+	Command cmd_;
+
+	// quad state
+	QuadState state_;
+	Vector<3> size_;
+	bool collision_;
+
+	// auxiliar variablers
+	Vector<4> motor_omega_;
+	Vector<4> motor_thrusts_;
+	Matrix<4, 4> B_allocation_;
+	Matrix<4, 4> B_allocation_inv_;
+
+	// P gain for body-rate control
+	const Matrix<3, 3> Kinv_ang_vel_tau_ = Vector<3>(16.6, 16.6, 5.0).asDiagonal();
+	// gravity
+	const Vector<3> gz_{0.0, 0.0, Gz};
+
+	// auxiliary variables
+	Matrix<3, 2> world_box_;
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/objects/static_gate.hpp
+++ b/flightlib/include/flightlib/objects/static_gate.hpp
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "flightlib/objects/static_object.hpp"
+
+namespace flightlib {
+class StaticGate : public StaticObject {
+ public:
+  StaticGate(const std::string& id, const std::string& prefab_id = "rpg_gate")
+    : StaticObject(id, prefab_id) {}
+  ~StaticGate() {}
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/objects/static_object.hpp
+++ b/flightlib/include/flightlib/objects/static_object.hpp
@@ -0,0 +1,36 @@
+#pragma once
+
+#include "flightlib/common/types.hpp"
+
+namespace flightlib {
+class StaticObject {
+ public:
+  StaticObject(const std::string& id, const std::string& prefab_id)
+    : id_(id), prefab_id_(prefab_id){};
+  virtual ~StaticObject(){};
+
+  // public set functions
+  virtual void setPosition(const Vector<3>& position) { position_ = position; };
+  virtual void setQuaternion(const Quaternion& quaternion) {
+    quat_ = quaternion;
+  };
+  virtual void setSize(const Vector<3>& size) { size_ = size; };
+
+  // public get functions
+  virtual Vector<3> getPosition(void) { return position_; };
+  virtual Quaternion getQuaternion(void) { return quat_; };
+  virtual Vector<3> getSize(void) { return size_; };
+  const std::string& getID(void) { return id_; };
+  const std::string& getPrefabID(void) { return prefab_id_; };
+
+ private:
+  std::string id_;
+  std::string prefab_id_;
+
+ protected:
+  Vector<3> position_{0.0, 0.0, 0.0};
+  Quaternion quat_{1.0, 0.0, 0.0, 0.0};
+  Vector<3> size_{1.0, 1.0, 1.0};
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/objects/unity_camera.hpp
+++ b/flightlib/include/flightlib/objects/unity_camera.hpp
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "flightlib/common/types.hpp"
+#include "flightlib/sensors/rgb_camera.hpp"
+
+namespace flightlib {
+
+class UnityCamera : RGBCamera {
+ public:
+  UnityCamera();
+  ~UnityCamera();
+
+ private:
+};
+}  // namespace flightlib
--- a/flightlib/include/flightlib/ros_nodes/flight_pilot.hpp
+++ b/flightlib/include/flightlib/ros_nodes/flight_pilot.hpp
@@ -0,0 +1,92 @@
+
+#pragma once
+
+#include <cv_bridge/cv_bridge.h>
+#include <nav_msgs/Odometry.h>
+#include <ros/ros.h>
+#include <sensor_msgs/CameraInfo.h>
+#include <yaml-cpp/yaml.h>
+
+#include <cmath>
+#include <filesystem>
+#include <memory>
+
+#include "std_msgs/Empty.h"
+// flightlib
+#include "flightlib/bridges/unity_bridge.hpp"
+#include "flightlib/common/quad_state.hpp"
+#include "flightlib/common/types.hpp"
+#include "flightlib/objects/quadrotor.hpp"
+#include "flightlib/sensors/rgb_camera.hpp"
+#include "flightlib/sensors/sgm_gpu/sgm_gpu.h"
+
+using namespace flightlib;
+
+namespace flightros {
+
+class FlightPilot {
+   public:
+	EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+	FlightPilot(const ros::NodeHandle& nh, const ros::NodeHandle& pnh);
+	~FlightPilot();
+
+	// callbacks
+	void mainLoopCallback(const ros::TimerEvent& event);
+	void spawnTreeCallback(const std_msgs::Empty::ConstPtr& msg);
+	void clearTreeCallback(const std_msgs::Empty::ConstPtr& msg);
+	void savePointcloudCallback(const std_msgs::Empty::ConstPtr& msg);
+	void poseCallback(const nav_msgs::Odometry::ConstPtr& msg);
+
+	// unity
+	bool setUnity(const bool render);
+	bool connectUnity(void);
+	bool disconnectUnity();
+	bool loadParams(const YAML::Node& cfg);
+	bool configCamera(const YAML::Node& cfg);
+	void computeDepthImage(const cv::Mat& left_frame, const cv::Mat& right_frame, cv::Mat* const depth);
+	bool spawnTreesAndSavePointCloud();
+
+   private:
+	// ros nodes
+	ros::NodeHandle nh_;
+	ros::NodeHandle pnh_;
+
+	// publisher & subscriber
+	std::string odom_topic_;
+	ros::Publisher stereo_pub, left_img_pub, depth_pub, cam_info_pub;
+	ros::Subscriber state_est_sub_, spawn_tree_sub_, clear_tree_sub_, save_pc_sub_;
+
+	// main loop timer
+	ros::Timer timer_main_loop_;
+	ros::Time timestamp;
+	Scalar main_loop_freq_{50.0};
+
+	// unity & quadrotor
+	Vector<3> quad_size_;
+	QuadState quad_state_;
+	std::shared_ptr<Quadrotor> quad_ptr_;
+	std::shared_ptr<UnityBridge> unity_bridge_ptr_;
+	SceneID scene_id_{UnityScene::WAREHOUSE};
+	bool unity_ready_{false};
+	bool unity_render_{false};
+	RenderMessage_t unity_output_;
+	uint16_t receive_id_{0};
+	FrameID frameID{0};
+
+	// camera param
+	Scalar stereo_baseline_;
+	Scalar fov_;
+	int width_;
+	int height_;
+	bool use_depth, use_stereo;
+	std::shared_ptr<RGBCamera> rgb_camera_left, rgb_camera_right;
+	std::shared_ptr<sgm_gpu::SgmGpu> sgm_;
+
+	// tree generation
+	int ply_id_{0};
+	Scalar avg_tree_spacing_;
+	Vector<3> bounding_box_, bounding_box_origin_;
+	Scalar pointcloud_resolution_;
+	std::string ply_path_;
+};
+}  // namespace flightros
--- a/flightlib/include/flightlib/sensors/imu.hpp
+++ b/flightlib/include/flightlib/sensors/imu.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "flightlib/common/types.hpp"
+#include "flightlib/sensors/sensor_base.hpp"
+
+namespace flightlib {
+
+class IMU : SensorBase {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+  IMU();
+  ~IMU();
+
+ private:
+};
+}  // namespace flightlib
--- a/flightlib/include/flightlib/sensors/rgb_camera.hpp
+++ b/flightlib/include/flightlib/sensors/rgb_camera.hpp
@@ -0,0 +1,126 @@
+#pragma once
+
+#include <yaml-cpp/yaml.h>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <mutex>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+
+#include "flightlib/common/logger.hpp"
+#include "flightlib/common/types.hpp"
+#include "flightlib/sensors/sensor_base.hpp"
+
+namespace flightlib {
+
+enum CameraLayer { RGB = 0, DepthMap = 1, Segmentation = 2, OpticalFlow = 3 };
+
+namespace RGBCameraTypes {
+  typedef int8_t Intensity_t;
+  typedef cv::Mat Image_t;
+
+  struct RGBImage_t {
+    Image_t image;
+    USecs elapsed_useconds;
+  };
+  struct Depthmap_t {
+    Image_t image;
+    USecs elapsed_useconds;
+  };
+  struct Segement_t {
+    Image_t image;
+    USecs elapsed_useconds;
+  };
+
+  struct OpticFlow_t {
+    Image_t image;
+    USecs elapsed_useconds;
+  };
+
+  typedef Eigen::Matrix4d Mat4_t;
+  typedef Eigen::Vector3d Vec3_t;
+
+  typedef std::function<Eigen::Vector3d()> GetPos_t;
+  typedef std::function<Eigen::Vector3d()> GetVel_t;
+  typedef std::function<Eigen::Vector3d()> GetAcc_t;
+  typedef std::function<Eigen::Quaterniond()> GetQuat_t;
+  typedef std::function<Eigen::Vector3d()> GetOmega_t;
+  typedef std::function<Eigen::Vector3d()> GetPsi_t;
+
+  const std::string RGB = "rgb";
+  // image post processing
+  typedef std::string PostProcessingID;
+  const PostProcessingID Depth = "depth";
+  const PostProcessingID OpticalFlow = "optical_flow";
+  const PostProcessingID ObjectSegment = "object_segment";      // object segmentation
+  const PostProcessingID CategorySegment = "category_segment";  // category segmentation
+}  // namespace RGBCameraTypes
+
+class RGBCamera : SensorBase {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+  RGBCamera();
+  ~RGBCamera();
+
+  // public set functions
+  bool setRelPose(const Ref<Vector<3>> B_r_BC, const Ref<Matrix<3, 3>> R_BC);
+  bool setWidth(const int width);
+  bool setHeight(const int height);
+  bool setFOV(const Scalar fov);
+  bool setDepthScale(const Scalar depth_scale);
+  bool setPostProcesscing(const std::vector<bool>& enabled_layers);
+  bool feedImageQueue(const int image_layer, const cv::Mat& image_mat);
+  void clearImageQueue();
+
+  // public get functions
+  std::vector<bool> getEnabledLayers(void) const;
+  std::vector<std::string> GetPostProcessing(void);
+  Matrix<4, 4> getRelPose(void) const;
+  int getChannels(void) const;
+  int getWidth(void) const;
+  int getHeight(void) const;
+  Scalar getFOV(void) const;
+  Scalar getDepthScale(void) const;
+  bool getRGBImage(cv::Mat& rgb_img);
+  bool getDepthMap(cv::Mat& depth_map);
+  bool getSegmentation(cv::Mat& segmentation);
+  bool getOpticalFlow(cv::Mat& opticalflow);
+
+  // auxiliary functions
+  void enableDepth(const bool on);
+  void enableOpticalFlow(const bool on);
+  void enableSegmentation(const bool on);
+
+ private:
+  Logger logger_{"RBGCamera"};
+
+  // camera parameters
+  int channels_;
+  int width_;
+  int height_;
+  Scalar fov_;
+  Scalar depth_scale_;
+
+  // Camera relative
+  Vector<3> B_r_BC_;
+  Matrix<4, 4> T_BC_;
+
+  // image data buffer
+  std::mutex queue_mutex_;
+  const int queue_size_ = 0;  // 1
+
+  // TODO：不要用队列，就单纯的Mat就好了，也不会有滞留的问题，也不会有弹空的问题；先不改了省的出错
+  std::deque<cv::Mat> rgb_queue_;
+  std::deque<cv::Mat> depth_queue_;
+  std::deque<cv::Mat> opticalflow_queue_;
+  std::deque<cv::Mat> segmentation_queue_;
+
+  // [rgb, depth, segmentation, optical flow]
+  std::vector<bool> enabled_layers_;
+  // [depth, optical flow, segmentation, segmentation]
+  std::unordered_map<RGBCameraTypes::PostProcessingID, bool> post_processing_;
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/sensors/sensor_base.hpp
+++ b/flightlib/include/flightlib/sensors/sensor_base.hpp
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "flightlib/common/types.hpp"
+
+namespace flightlib {
+class SensorBase {
+ public:
+  SensorBase();
+  virtual ~SensorBase();
+
+ private:
+};
+
+}  // namespace flightlib
--- a/flightlib/include/flightlib/sensors/sgm_gpu/configuration.h
+++ b/flightlib/include/flightlib/sensors/sgm_gpu/configuration.h
@@ -0,0 +1,39 @@
+/***********************************************************************
+  Copyright (C) 2020 Hironori Fujimoto
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+ 
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+
+#ifndef SGM_GPU__CONFIGURATION_H_
+#define SGM_GPU__CONFIGURATION_H_
+
+#include <stdint.h>
+
+#define	MAX_DISPARITY 128
+#define CENSUS_WIDTH  9
+#define CENSUS_HEIGHT 7
+
+#define TOP  (CENSUS_HEIGHT-1)/2
+#define LEFT (CENSUS_WIDTH-1)/2
+
+namespace sgm_gpu
+{
+
+typedef uint32_t cost_t;
+
+}
+
+#define COSTAGG_BLOCKSIZE       GPU_THREADS_PER_BLOCK
+#define COSTAGG_BLOCKSIZE_HORIZ GPU_THREADS_PER_BLOCK
+
+#endif // SGM_GPU__CONFIGURATION_H_
--- a/flightlib/include/flightlib/sensors/sgm_gpu/cost_aggregation.h
+++ b/flightlib/include/flightlib/sensors/sgm_gpu/cost_aggregation.h
@@ -0,0 +1,511 @@
+/***********************************************************************
+  Copyright (C) 2019 Hironori Fujimoto
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+#include "flightlib/sensors/sgm_gpu/util.h"
+
+#ifndef COST_AGGREGATION_H_
+#define COST_AGGREGATION_H_
+
+#define ITER_COPY			0
+#define ITER_NORMAL			1
+
+#define MIN_COMPUTE			0
+#define MIN_NOCOMPUTE		1
+
+#define DIR_UPDOWN			0
+#define DIR_DOWNUP			1
+#define DIR_LEFTRIGHT		2
+#define DIR_RIGHTLEFT		3
+
+namespace sgm_gpu
+{
+
+template<int add_col, bool recompute, bool join_dispcomputation>
+__device__ __forceinline__ void CostAggregationGenericIndexesIncrement(int *index, int *index_im, int *col, const int add_index, const int add_imindex) {
+  *index += add_index;
+  if(recompute || join_dispcomputation) {
+    *index_im += add_imindex;
+    if(recompute) {
+      *col += add_col;
+    }
+  }
+}
+
+template<int add_index, bool recompute, bool join_dispcomputation>
+__device__ __forceinline__ void CostAggregationDiagonalGenericIndexesIncrement(int *index, int *index_im, int *col, const int cols, const int initial_row, const int i, const int dis) {
+  *col += add_index;
+  if(add_index > 0 && *col >= cols) {
+    *col = 0;
+  } else if(*col < 0) {
+    *col = cols-1;
+  }
+  *index = abs(initial_row-i)*cols*MAX_DISPARITY+*col*MAX_DISPARITY+dis;
+  if(recompute || join_dispcomputation) {
+    *index_im = abs(initial_row-i)*cols+*col;
+  }
+}
+
+template<class T, int iter_type, int min_type, int dir_type, bool first_iteration, bool recompute, bool join_dispcomputation>
+__device__ __forceinline__ void CostAggregationGenericIteration(int index, int index_im, int col, uint32_t *old_values, int *old_value1, int *old_value2, int *old_value3, int *old_value4, uint32_t *min_cost, uint32_t *min_cost_p2, uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int p1_vector, const int p2_vector, const T *_d_transform0, const T *_d_transform1, const int lane, const int MAX_PAD, const int dis, T *rp0, T *rp1, T *rp2, T *rp3, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
+  const T __restrict__ *d_transform0 = _d_transform0;
+  const T __restrict__ *d_transform1 = _d_transform1;
+  uint32_t costs, next_dis, prev_dis;
+
+  if(iter_type == ITER_NORMAL) {
+    // First shuffle
+    int prev_dis1 = shfl_up_32(*old_value4, 1);
+    if(lane == 0) {
+      prev_dis1 = MAX_PAD;
+    }
+
+    // Second shuffle
+    int next_dis4 = shfl_down_32(*old_value1, 1);
+    if(lane == 31) {
+      next_dis4 = MAX_PAD;
+    }
+
+    // Shift + rotate
+    //next_dis = __funnelshift_r(next_dis4, *old_values, 8);
+    next_dis = __byte_perm(*old_values, next_dis4, 0x4321);
+    prev_dis = __byte_perm(*old_values, prev_dis1, 0x2104);
+
+    next_dis = next_dis + p1_vector;
+    prev_dis = prev_dis + p1_vector;
+  }
+  if(recompute) {
+    const int dif = col - dis;
+    if(dir_type == DIR_LEFTRIGHT) {
+      if(lane == 0) {
+        // lane = 0 is dis = 0, no need to subtract dis
+        *rp0 = d_transform1[index_im];
+      }
+    } else if(dir_type == DIR_RIGHTLEFT) {
+      // First iteration, load D pixels
+      if(first_iteration) {
+        const uint4 right = reinterpret_cast<const uint4*>(&d_transform1[index_im-dis-3])[0];
+        *rp3 = right.x;
+        *rp2 = right.y;
+        *rp1 = right.z;
+        *rp0 = right.w;
+      } else if(lane == 31 && dif >= 3) {
+        *rp3 = d_transform1[index_im-dis-3];
+      }
+    } else {
+  /*
+      __shared__ T right_p[MAX_DISPARITY+32];
+      const int warp_id = threadIdx.x / WARP_SIZE;
+      if(warp_id < 5) {
+        const int block_imindex = index_im - warp_id + 32;
+        const int rp_index = warp_id*WARP_SIZE+lane;
+        const int col_cpy = col-warp_id+32;
+        right_p[rp_index] = ((col_cpy-(159-rp_index)) >= 0) ? ld_gbl_cs(&d_transform1[block_imindex-(159-rp_index)]) : 0;
+      }*/
+
+      __shared__ T right_p[128+32];
+      const int warp_id = threadIdx.x / WARP_SIZE;
+      const int block_imindex = index_im - warp_id + 2;
+      const int rp_index = warp_id*WARP_SIZE+lane;
+      const int col_cpy = col-warp_id+2;
+      right_p[rp_index] = ((col_cpy-(129-rp_index)) >= 0) ? d_transform1[block_imindex-(129-rp_index)] : 0;
+      right_p[rp_index+64] = ((col_cpy-(129-rp_index-64)) >= 0) ? d_transform1[block_imindex-(129-rp_index-64)] : 0;
+      //right_p[rp_index+128] = ld_gbl_cs(&d_transform1[block_imindex-(129-rp_index-128)]);
+      if(warp_id == 0) {
+        right_p[128+lane] = ld_gbl_cs(&d_transform1[block_imindex-(129-lane)]);
+      }
+      __syncthreads();
+
+      const int px = MAX_DISPARITY+warp_id-dis-1;
+      *rp0 = right_p[px];
+      *rp1 = right_p[px-1];
+      *rp2 = right_p[px-2];
+      *rp3 = right_p[px-3];
+    }
+    const T left_pixel = d_transform0[index_im];
+    *old_value1 = popcount(left_pixel ^ *rp0);
+    *old_value2 = popcount(left_pixel ^ *rp1);
+    *old_value3 = popcount(left_pixel ^ *rp2);
+    *old_value4 = popcount(left_pixel ^ *rp3);
+    if(iter_type == ITER_COPY) {
+      *old_values = uchars_to_uint32(*old_value1, *old_value2, *old_value3, *old_value4);
+    } else {
+      costs = uchars_to_uint32(*old_value1, *old_value2, *old_value3, *old_value4);
+    }
+    // Prepare for next iteration
+    if(dir_type == DIR_LEFTRIGHT) {
+      *rp3 = shfl_up_32(*rp3, 1);
+    } else if(dir_type == DIR_RIGHTLEFT) {
+      *rp0 = shfl_down_32(*rp0, 1);
+    }
+  } else {
+    if(iter_type == ITER_COPY) {
+      *old_values = ld_gbl_ca(reinterpret_cast<const uint32_t*>(&d_cost[index]));
+    } else {
+      costs = ld_gbl_ca(reinterpret_cast<const uint32_t*>(&d_cost[index]));
+    }
+  }
+
+  if(iter_type == ITER_NORMAL) {
+    const uint32_t min1 = __vminu4(*old_values, prev_dis);
+    const uint32_t min2 = __vminu4(next_dis, *min_cost_p2);
+    const uint32_t min_prev = __vminu4(min1, min2);
+    *old_values = costs + (min_prev - *min_cost);
+  }
+  if(iter_type == ITER_NORMAL || !recompute) {
+    uint32_to_uchars(*old_values, old_value1, old_value2, old_value3, old_value4);
+  }
+
+  if(join_dispcomputation) {
+    const uint32_t L0_costs = *((uint32_t*) (d_L0+index));
+    const uint32_t L1_costs = *((uint32_t*) (d_L1+index));
+    const uint32_t L2_costs = *((uint32_t*) (d_L2+index));
+    const uint32_t L3_costs = *((uint32_t*) (d_L3+index));
+    const uint32_t L4_costs = *((uint32_t*) (d_L4+index));
+    const uint32_t L5_costs = *((uint32_t*) (d_L5+index));
+    const uint32_t L6_costs = *((uint32_t*) (d_L6+index));
+
+    int l0_x, l0_y, l0_z, l0_w;
+    int l1_x, l1_y, l1_z, l1_w;
+    int l2_x, l2_y, l2_z, l2_w;
+    int l3_x, l3_y, l3_z, l3_w;
+    int l4_x, l4_y, l4_z, l4_w;
+    int l5_x, l5_y, l5_z, l5_w;
+    int l6_x, l6_y, l6_z, l6_w;
+
+    uint32_to_uchars(L0_costs, &l0_x, &l0_y, &l0_z, &l0_w);
+    uint32_to_uchars(L1_costs, &l1_x, &l1_y, &l1_z, &l1_w);
+    uint32_to_uchars(L2_costs, &l2_x, &l2_y, &l2_z, &l2_w);
+    uint32_to_uchars(L3_costs, &l3_x, &l3_y, &l3_z, &l3_w);
+    uint32_to_uchars(L4_costs, &l4_x, &l4_y, &l4_z, &l4_w);
+    uint32_to_uchars(L5_costs, &l5_x, &l5_y, &l5_z, &l5_w);
+    uint32_to_uchars(L6_costs, &l6_x, &l6_y, &l6_z, &l6_w);
+
+    const uint16_t val1 = l0_x + l1_x + l2_x + l3_x + l4_x + l5_x + l6_x + *old_value1;
+    const uint16_t val2 = l0_y + l1_y + l2_y + l3_y + l4_y + l5_y + l6_y + *old_value2;
+    const uint16_t val3 = l0_z + l1_z + l2_z + l3_z + l4_z + l5_z + l6_z + *old_value3;
+    const uint16_t val4 = l0_w + l1_w + l2_w + l3_w + l4_w + l5_w + l6_w + *old_value4;
+
+    int min_idx1 = dis;
+    uint16_t min1 = val1;
+    if(val1 > val2) {
+      min1 = val2;
+      min_idx1 = dis+1;
+    }
+
+    int min_idx2 = dis+2;
+    uint16_t min2 = val3;
+    if(val3 > val4) {
+      min2 = val4;
+      min_idx2 = dis+3;
+    }
+
+    uint16_t minval = min1;
+    int min_idx = min_idx1;
+    if(min1 > min2) {
+      minval = min2;
+      min_idx = min_idx2;
+    }
+
+    const int min_warpindex = warpReduceMinIndex(minval, min_idx);
+    if(lane == 0) {
+      d_disparity[index_im] = min_warpindex;
+    }
+    
+    // Save smoothed cost to obtain right disparity
+    d_s[index] = val1;
+    d_s[index+1] = val2;
+    d_s[index+2] = val3;
+    d_s[index+3] = val4;
+  } else {
+    st_gbl_cs(reinterpret_cast<uint32_t*>(&d_L[index]), *old_values);
+  }
+  if(min_type == MIN_COMPUTE) {
+    int min_cost_scalar = min(min(*old_value1, *old_value2), min(*old_value3, *old_value4));
+    *min_cost = uchar_to_uint32(warpReduceMin(min_cost_scalar));
+    *min_cost_p2 = *min_cost + p2_vector;
+  }
+}
+
+template<class T, int add_col, int dir_type, bool recompute, bool join_dispcomputation>
+__device__ __forceinline__ void CostAggregationGeneric(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int initial_row, const int initial_col, const int max_iter, const int cols, int add_index, const T *_d_transform0, const T *_d_transform1, const int add_imindex, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
+  const int lane = threadIdx.x % WARP_SIZE;
+  const int dis = 4*lane;
+  int index = initial_row*cols*MAX_DISPARITY+initial_col*MAX_DISPARITY+dis;
+  int col, index_im;
+  if(recompute || join_dispcomputation) {
+    if(recompute) {
+      col = initial_col;
+    }
+    index_im = initial_row*cols+initial_col;
+  }
+
+  const int MAX_PAD = UCHAR_MAX-P1;
+  const uint32_t p1_vector = uchars_to_uint32(P1, P1, P1, P1);
+  const uint32_t p2_vector = uchars_to_uint32(P2, P2, P2, P2);
+  int old_value1;
+  int old_value2;
+  int old_value3;
+  int old_value4;
+  uint32_t min_cost, min_cost_p2, old_values;
+  T rp0, rp1, rp2, rp3;
+
+  if(recompute) {
+    if(dir_type == DIR_LEFTRIGHT) {
+      CostAggregationGenericIteration<T, ITER_COPY, MIN_COMPUTE, dir_type, true, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp3, &rp0, &rp1, &rp2, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp2, &rp3, &rp0, &rp1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp1, &rp2, &rp3, &rp0, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      for(int i = 4; i < max_iter-3; i+=4) {
+        CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+        CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+        CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+        CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp3, &rp0, &rp1, &rp2, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+        CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+        CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp2, &rp3, &rp0, &rp1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+        CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+        CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp1, &rp2, &rp3, &rp0, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      }
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp3, &rp0, &rp1, &rp2, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp2, &rp3, &rp0, &rp1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_NOCOMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp1, &rp2, &rp3, &rp0, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+    } else if(dir_type == DIR_RIGHTLEFT) {
+      CostAggregationGenericIteration<T, ITER_COPY, MIN_COMPUTE, dir_type, true, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp1, &rp2, &rp3, &rp0, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp2, &rp3, &rp0, &rp1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp3, &rp0, &rp1, &rp2, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      for(int i = 4; i < max_iter-3; i+=4) {
+        CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+        CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+        CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+        CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp1, &rp2, &rp3, &rp0, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+        CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+        CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp2, &rp3, &rp0, &rp1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+        CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+        CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp3, &rp0, &rp1, &rp2, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      }
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp1, &rp2, &rp3, &rp0, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp2, &rp3, &rp0, &rp1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_NOCOMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp3, &rp0, &rp1, &rp2, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+    } else {
+      CostAggregationGenericIteration<T, ITER_COPY, MIN_COMPUTE, dir_type, true, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      for(int i = 1; i < max_iter; i++) {
+        CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+        CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+      }
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_NOCOMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+    }
+  } else {
+    CostAggregationGenericIteration<T, ITER_COPY, MIN_COMPUTE, dir_type, true, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+
+    for(int i = 1; i < max_iter; i++) {
+      CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+    }
+    CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
+    CostAggregationGenericIteration<T, ITER_NORMAL, MIN_NOCOMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+  }
+}
+
+template<int add_index, class T, int dir_type, bool recompute, bool join_dispcomputation>
+__device__ __forceinline__ void CostAggregationDiagonalGeneric(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int initial_row, const int initial_col, const int max_iter, const int col_nomin, const int col_copycost, const int cols, const T *_d_transform0, const T *_d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
+  const int lane = threadIdx.x % WARP_SIZE;
+  const int dis = 4*lane;
+  int col = initial_col;
+  int index = initial_row*cols*MAX_DISPARITY+initial_col*MAX_DISPARITY+dis;
+  int index_im;
+  if(recompute || join_dispcomputation) {
+    index_im = initial_row*cols+col;
+  }
+  const int MAX_PAD = UCHAR_MAX-P1;
+  const uint32_t p1_vector = uchars_to_uint32(P1, P1, P1, P1);
+  const uint32_t p2_vector = uchars_to_uint32(P2, P2, P2, P2);
+  int old_value1;
+  int old_value2;
+  int old_value3;
+  int old_value4;
+  uint32_t min_cost, min_cost_p2, old_values;
+  T rp0, rp1, rp2, rp3;
+
+  CostAggregationGenericIteration<T, ITER_COPY, MIN_COMPUTE, dir_type, true, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+  for(int i = 1; i < max_iter; i++) {
+    CostAggregationDiagonalGenericIndexesIncrement<add_index, recompute, join_dispcomputation>(&index, &index_im, &col, cols, initial_row, i, dis);
+    if(col == col_copycost) {
+      CostAggregationGenericIteration<T, ITER_COPY, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+    } else {
+      CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+    }
+  }
+
+  CostAggregationDiagonalGenericIndexesIncrement<add_index, recompute, join_dispcomputation>(&index, &index_im, &col, cols, max_iter, initial_row, dis);
+  if(col == col_copycost) {
+    CostAggregationGenericIteration<T, ITER_COPY, MIN_NOCOMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+  } else {
+    CostAggregationGenericIteration<T, ITER_NORMAL, MIN_NOCOMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+  }
+}
+
+template<class T>
+
+__global__ void CostAggregationKernelDiagonalDownUpRightLeft(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
+  const int initial_col = cols - (blockIdx.x*(blockDim.x/WARP_SIZE) + (threadIdx.x / WARP_SIZE)) - 1;
+  if(initial_col < cols) {
+    const int initial_row = rows-1;
+    const int add_index = -1;
+    const int col_nomin = 0;
+    const int col_copycost = cols-1;
+    const int max_iter = rows-1;
+    const bool recompute = false;
+    const bool join_dispcomputation = false;
+
+    CostAggregationDiagonalGeneric<add_index, T, DIR_DOWNUP, recompute, join_dispcomputation>(d_cost, d_L, d_s, P1, P2, initial_row, initial_col, max_iter, col_nomin, col_copycost, cols, d_transform0, d_transform1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+  }
+}
+
+template<class T>
+__global__ void CostAggregationKernelDiagonalDownUpLeftRight(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
+  const int initial_col = cols - (blockIdx.x*(blockDim.x/WARP_SIZE) + (threadIdx.x / WARP_SIZE)) - 1;
+  if(initial_col >= 0) {
+    const int initial_row = rows-1;
+    const int add_index = 1;
+    const int col_nomin = cols-1;
+    const int col_copycost = 0;
+    const int max_iter = rows-1;
+    const bool recompute = false;
+    const bool join_dispcomputation = false;
+
+    CostAggregationDiagonalGeneric<add_index, T, DIR_DOWNUP, recompute, join_dispcomputation>(d_cost, d_L, d_s, P1, P2, initial_row, initial_col, max_iter, col_nomin, col_copycost, cols, d_transform0, d_transform1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+  }
+}
+
+template<class T>
+
+__global__ void CostAggregationKernelDiagonalUpDownRightLeft(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
+  const int initial_col = blockIdx.x*(blockDim.x/WARP_SIZE) + (threadIdx.x / WARP_SIZE);
+  if(initial_col < cols) {
+    const int initial_row = 0;
+    const int add_index = -1;
+    const int col_nomin = 0;
+    const int col_copycost = cols-1;
+    const int max_iter = rows-1;
+    const bool recompute = false;
+    const bool join_dispcomputation = true;
+
+    CostAggregationDiagonalGeneric<add_index, T, DIR_UPDOWN, recompute, join_dispcomputation>(d_cost, d_L, d_s, P1, P2, initial_row, initial_col, max_iter, col_nomin, col_copycost, cols, d_transform0, d_transform1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+  }
+}
+
+template<class T>
+
+__global__ void CostAggregationKernelDiagonalUpDownLeftRight(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
+  const int initial_col = blockIdx.x*(blockDim.x/WARP_SIZE) + (threadIdx.x / WARP_SIZE);
+  if(initial_col < cols) {
+    const int initial_row = 0;
+    const int add_index = 1;
+    const int col_nomin = cols-1;
+    const int col_copycost = 0;
+    const int max_iter = rows-1;
+    const bool recompute = false;
+    const bool join_dispcomputation = false;
+
+    CostAggregationDiagonalGeneric<add_index, T, DIR_UPDOWN, recompute, join_dispcomputation>(d_cost, d_L, d_s, P1, P2, initial_row, initial_col, max_iter, col_nomin, col_copycost, cols, d_transform0, d_transform1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+  }
+}
+
+template<class T>
+
+__global__ void CostAggregationKernelLeftToRight(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
+  const int initial_row = blockIdx.x*(blockDim.x/WARP_SIZE) + (threadIdx.x / WARP_SIZE);
+  if(initial_row < rows) {
+    const int initial_col = 0;
+    const int add_index = MAX_DISPARITY;
+    const int add_imindex = 1;
+    const int max_iter = cols-1;
+    const int add_col = 1;
+    const bool recompute = true;
+    const bool join_dispcomputation = false;
+
+    CostAggregationGeneric<T, add_col, DIR_LEFTRIGHT, recompute, join_dispcomputation>(d_cost, d_L, d_s, P1, P2, initial_row, initial_col, max_iter, cols, add_index, d_transform0, d_transform1, add_imindex, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+  }
+}
+
+template<class T>
+
+__global__ void CostAggregationKernelRightToLeft(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
+  const int initial_row = blockIdx.x*(blockDim.x/WARP_SIZE) + (threadIdx.x / WARP_SIZE);
+  if(initial_row < rows) {
+    const int initial_col = cols-1;
+    const int add_index = -MAX_DISPARITY;
+    const int add_imindex = -1;
+    const int max_iter = cols-1;
+    const int add_col = -1;
+    const bool recompute = true;
+    const bool join_dispcomputation = false;
+
+    CostAggregationGeneric<T, add_col, DIR_RIGHTLEFT, recompute, join_dispcomputation>(d_cost, d_L, d_s, P1, P2, initial_row, initial_col, max_iter, cols, add_index, d_transform0, d_transform1, add_imindex, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+  }
+}
+
+template<class T>
+__global__ void CostAggregationKernelDownToUp(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
+  const int initial_col = blockIdx.x*(blockDim.x/WARP_SIZE) + (threadIdx.x / WARP_SIZE);
+  if(initial_col < cols) {
+    const int initial_row = rows-1;
+    const int add_index = -cols*MAX_DISPARITY;
+    const int add_imindex = -cols;
+    const int max_iter = rows-1;
+    const int add_col = 0;
+    const bool recompute = false;
+    const bool join_dispcomputation = false;
+
+    CostAggregationGeneric<T, add_col, DIR_DOWNUP, recompute, join_dispcomputation>(d_cost, d_L, d_s, P1, P2, initial_row, initial_col, max_iter, cols, add_index, d_transform0, d_transform1, add_imindex, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+  }
+}
+
+template<class T>
+//__launch_bounds__(64, 16)
+__global__ void CostAggregationKernelUpToDown(uint8_t* d_cost, uint8_t *d_L, uint16_t* d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
+  const int initial_col = blockIdx.x*(blockDim.x/WARP_SIZE) + (threadIdx.x / WARP_SIZE);
+  if(initial_col < cols) {
+    const int initial_row = 0;
+    const int add_index = cols*MAX_DISPARITY;
+    const int add_imindex = cols;
+    const int max_iter = rows-1;
+    const int add_col = 0;
+    const bool recompute = false;
+    const bool join_dispcomputation = false;
+
+    CostAggregationGeneric<T, add_col, DIR_UPDOWN, recompute, join_dispcomputation>(d_cost, d_L, d_s, P1, P2, initial_row, initial_col, max_iter, cols, add_index, d_transform0, d_transform1, add_imindex, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
+  }
+}
+
+} // namespace sgm_gpu
+
+#endif /* COST_AGGREGATION_H_ */
--- a/flightlib/include/flightlib/sensors/sgm_gpu/costs.h
+++ b/flightlib/include/flightlib/sensors/sgm_gpu/costs.h
@@ -0,0 +1,30 @@
+/***********************************************************************
+  Copyright (C) 2020 Hironori Fujimoto
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+ 
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+#ifndef SGM_GPU__COSTS_H_
+#define SGM_GPU__COSTS_H_
+
+#include <stdint.h>
+#include "flightlib/sensors/sgm_gpu/configuration.h"
+
+namespace sgm_gpu
+{
+
+__global__ void CenterSymmetricCensusKernelSM2(const uint8_t *im, const uint8_t *im2, cost_t *transform, cost_t *transform2, const uint32_t rows, const uint32_t cols);
+
+}
+
+#endif // SGM_GPU__COSTS_H_
+
--- a/flightlib/include/flightlib/sensors/sgm_gpu/hamming_cost.h
+++ b/flightlib/include/flightlib/sensors/sgm_gpu/hamming_cost.h
@@ -0,0 +1,33 @@
+/***********************************************************************
+  Copyright (C) 2020 Hironori Fujimoto
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+
+#ifndef SGM_GPU__HAMMING_COST_H_
+#define SGM_GPU__HAMMING_COST_H_
+
+#include "flightlib/sensors/sgm_gpu/configuration.h"
+#include "flightlib/sensors/sgm_gpu/util.h"
+#include <stdint.h>
+
+namespace sgm_gpu
+{
+
+__global__ void
+HammingDistanceCostKernel(const cost_t *d_transform0, const cost_t *d_transform1, uint8_t *d_cost, const int rows, const int cols );
+
+}
+
+#endif // SGM_GPU__HAMMING_COST_H_ 
+
--- a/flightlib/include/flightlib/sensors/sgm_gpu/left_right_consistency.h
+++ b/flightlib/include/flightlib/sensors/sgm_gpu/left_right_consistency.h
@@ -0,0 +1,31 @@
+/***********************************************************************
+  Copyright (C) 2020 Hironori Fujimoto
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+ 
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+
+#ifndef SGM_GPU__LEFT_RIGHT_CONSISTENCY_H_
+#define SGM_GPU__LEFT_RIGHT_CONSISTENCY_H_
+
+#include <stdint.h>
+
+namespace sgm_gpu
+{
+
+__global__ void ChooseRightDisparity(uint8_t *right_disparity, const uint16_t *smoothed_cost, const uint32_t rows, const uint32_t cols);
+__global__ void LeftRightConsistencyCheck(uint8_t *disparity, const uint8_t *disparity_right, const uint32_t rows, const uint32_t cols);
+
+}
+
+#endif // SGM_GPU__LEFT_RIGHT_CONSISTENCY_H_
+
--- a/flightlib/include/flightlib/sensors/sgm_gpu/median_filter.h
+++ b/flightlib/include/flightlib/sensors/sgm_gpu/median_filter.h
@@ -0,0 +1,62 @@
+/***********************************************************************
+  Copyright (C) 2020 Hironori Fujimoto
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+
+#ifndef SGM_GPU__MEDIAN_FILTER_H_
+#define SGM_GPU__MEDIAN_FILTER_H_
+
+#include <stdint.h>
+
+namespace sgm_gpu
+{
+
+__global__ void MedianFilter3x3(const uint8_t* __restrict__ d_input, uint8_t* __restrict__ d_out, const uint32_t rows, const uint32_t cols);
+
+template<int n, typename T>
+__inline__ __device__ void MedianFilter(const T* __restrict__ d_input, T* __restrict__ d_out, const uint32_t rows, const uint32_t cols) {
+  const uint32_t idx = blockIdx.x*blockDim.x+threadIdx.x;
+  const uint32_t row = idx / cols;
+  const uint32_t col = idx % cols;
+  T window[n*n];
+  int half = n/2;
+
+  if(row >= half && col >= half && row < rows-half && col < cols-half) {
+    for(uint32_t i = 0; i < n; i++) {
+      for(uint32_t j = 0; j < n; j++) {
+        window[i*n+j] = d_input[(row-half+i)*cols+col-half+j];
+      }
+    }
+
+    for(uint32_t i = 0; i < (n*n/2)+1; i++) {
+      uint32_t min_idx = i;
+      for(uint32_t j = i+1; j < n*n; j++) {
+        if(window[j] < window[min_idx]) {
+          min_idx = j;
+        }
+      }
+      const T tmp = window[i];
+      window[i] = window[min_idx];
+      window[min_idx] = tmp;
+    }
+    d_out[idx] = window[n*n/2];
+  } else if(row < rows && col < cols) {
+    d_out[idx] = d_input[idx];
+  }
+}
+
+} // namespace sgm_gpu
+
+#endif // SGM_GPU__MEDIAN_FILTER_H_
+
--- a/flightlib/include/flightlib/sensors/sgm_gpu/sgm_gpu.h
+++ b/flightlib/include/flightlib/sensors/sgm_gpu/sgm_gpu.h
@@ -0,0 +1,90 @@
+/***********************************************************************
+  Copyright (C) 2020 Hironori Fujimoto
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+#ifndef SGM_GPU__SGM_GPU_H_
+#define SGM_GPU__SGM_GPU_H_
+
+#include "flightlib/sensors/sgm_gpu/configuration.h"
+
+#include <opencv2/opencv.hpp>
+#include <opencv2/imgproc/types_c.h>
+
+namespace sgm_gpu {
+
+class SgmGpu {
+ private:
+  /**
+   * @brief Parameter used in SGM algorithm
+   *
+   * See SGM paper.
+   */
+  uint8_t p1_, p2_;
+  /**
+   * @brief Enable/disable left-right consistency check
+   */
+  bool check_consistency_;
+
+  // Memory for disparity computation
+  // d_: for device
+  uint8_t *d_im0_;
+  uint8_t *d_im1_;
+  uint32_t *d_transform0_;
+  uint32_t *d_transform1_;
+  uint8_t *d_cost_;
+  uint8_t *d_disparity_;
+  uint8_t *d_disparity_filtered_uchar_;
+  uint8_t *d_disparity_right_;
+  uint8_t *d_disparity_right_filtered_uchar_;
+  uint8_t *d_L0_;
+  uint8_t *d_L1_;
+  uint8_t *d_L2_;
+  uint8_t *d_L3_;
+  uint8_t *d_L4_;
+  uint8_t *d_L5_;
+  uint8_t *d_L6_;
+  uint8_t *d_L7_;
+  uint16_t *d_s_;
+
+  bool memory_allocated_;
+
+  uint32_t cols_, rows_;
+
+  void allocateMemory(uint32_t cols, uint32_t rows);
+  void freeMemory();
+
+  /**
+   * @brief Resize images to be width and height divisible by 4 for limit of
+   * CUDA code
+   */
+  void resizeToDivisibleBy4(cv::Mat &left_image, cv::Mat &right_image);
+
+//  void convertToMsg(const cv::Mat_<unsigned char> &disparity,
+//                    const sensor_msgs::CameraInfo &left_camera_info,
+//                    const sensor_msgs::CameraInfo &right_camera_info,
+//                    stereo_msgs::DisparityImage &disparity_msg);
+
+ public:
+  /**
+   * @brief Constructor which use namespace <parent>/libsgm_gpu for ROS param
+   */
+  SgmGpu(const int cols, const int rows);
+  ~SgmGpu();
+
+  bool computeDisparity(const cv::Mat &left_image, const cv::Mat &right_image, cv::Mat &disparity_out);
+};
+
+}  // namespace sgm_gpu
+
+#endif  // SGM_GPU__SGM_GPU_H_
--- a/flightlib/include/flightlib/sensors/sgm_gpu/util.h
+++ b/flightlib/include/flightlib/sensors/sgm_gpu/util.h
@@ -0,0 +1,362 @@
+/***********************************************************************
+  Copyright (C) 2020 Hironori Fujimoto
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+
+#ifndef SGM_GPU__UTIL_H_
+#define SGM_GPU__UTIL_H_
+
+#include <iostream>
+#include <dirent.h>
+#include <stdio.h>
+
+#define FERMI false
+
+#define GPU_THREADS_PER_BLOCK_FERMI 256
+#define GPU_THREADS_PER_BLOCK_MAXWELL 64
+
+/* Defines related to GPU Architecture */
+#if FERMI
+  #define GPU_THREADS_PER_BLOCK   GPU_THREADS_PER_BLOCK_FERMI
+#else
+  #define GPU_THREADS_PER_BLOCK   GPU_THREADS_PER_BLOCK_MAXWELL
+#endif
+
+#define WARP_SIZE		32
+
+namespace sgm_gpu
+{
+
+static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
+#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
+
+/**
+* Check the return value of the CUDA runtime API call and exit
+* the application if the call has failed.
+*/
+static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err) {
+  if (err == cudaSuccess)
+    return;
+  std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
+  exit (1);
+}
+
+/*************************************
+GPU Side defines (ASM instructions)
+**************************************/
+
+// output temporal carry in internal register
+#define UADD__CARRY_OUT(c, a, b) \
+  asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b))
+
+// add & output with temporal carry of internal register
+#define UADD__IN_CARRY_OUT(c, a, b) \
+  asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b))
+
+// add with temporal carry of internal register
+#define UADD__IN_CARRY(c, a, b) \
+  asm volatile("addc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b))
+
+// packing and unpacking: from uint64_t to uint2
+#define V2S_B64(v,s) \
+  asm("mov.b64 %0, {%1,%2};" : "=l"(s) : "r"(v.x), "r"(v.y))
+
+// packing and unpacking: from uint2 to uint64_t
+#define S2V_B64(s,v) \
+  asm("mov.b64 {%0,%1}, %2;" : "=r"(v.x), "=r"(v.y) : "l"(s))
+
+
+/*************************************
+DEVICE side basic block primitives
+**************************************/
+
+#if FERMI
+  #define LDG(ptr)  (* ptr)
+#else
+  #define LDG(ptr)  __ldg(ptr)
+#endif
+
+#if FERMI
+__shared__ int interBuff[GPU_THREADS_PER_BLOCK];
+__inline__ __device__ int __emulated_shfl(const int scalarValue, const uint32_t source_lane) {
+  const int warpIdx = threadIdx.x / WARP_SIZE;
+  const int laneIdx = threadIdx.x % WARP_SIZE;
+  volatile int *interShuffle = interBuff + (warpIdx * WARP_SIZE);
+  interShuffle[laneIdx] = scalarValue;
+  return(interShuffle[source_lane % WARP_SIZE]);
+}
+#endif
+
+__inline__ __device__ int shfl_32(int scalarValue, const int lane) {
+  #if FERMI
+    return __emulated_shfl(scalarValue, (uint32_t)lane);
+  #else
+    return __shfl_sync(0xffffffff, scalarValue, lane);
+  #endif
+}
+
+__inline__ __device__ int shfl_up_32(int scalarValue, const int n) {
+  #if FERMI
+    int lane = threadIdx.x % WARP_SIZE;
+    lane -= n;
+    return shfl_32(scalarValue, lane);
+  #else
+    return __shfl_up_sync(0xffffffff, scalarValue, n);
+  #endif
+}
+
+__inline__ __device__ int shfl_down_32(int scalarValue, const int n) {
+  #if FERMI
+    int lane = threadIdx.x % WARP_SIZE;
+    lane += n;
+    return shfl_32(scalarValue, lane);
+  #else
+    return __shfl_down_sync(0xffffffff, scalarValue, n);
+  #endif
+}
+
+__inline__ __device__ int shfl_xor_32(int scalarValue, const int n) {
+  #if FERMI
+    int lane = threadIdx.x % WARP_SIZE;
+    lane = lane ^ n;
+    return shfl_32(scalarValue, lane);
+  #else
+    return __shfl_xor_sync(0xffffffff, scalarValue, n);
+  #endif
+}
+
+__device__ __forceinline__ uint32_t ld_gbl_ca(const __restrict__ uint32_t *addr) {
+  uint32_t return_value;
+  asm("ld.global.ca.u32 %0, [%1];" : "=r"(return_value) : "l"(addr));
+  return return_value;
+}
+
+__device__ __forceinline__ uint32_t ld_gbl_cs(const __restrict__ uint32_t *addr) {
+  uint32_t return_value;
+  asm("ld.global.cs.u32 %0, [%1];" : "=r"(return_value) : "l"(addr));
+  return return_value;
+}
+
+__device__ __forceinline__ void st_gbl_wt(const __restrict__ uint32_t *addr, const uint32_t value) {
+  asm("st.global.wt.u32 [%0], %1;" :: "l"(addr), "r"(value));
+}
+
+__device__ __forceinline__ void st_gbl_cs(const __restrict__ uint32_t *addr, const uint32_t value) {
+  asm("st.global.cs.u32 [%0], %1;" :: "l"(addr), "r"(value));
+}
+
+__device__ __forceinline__ uint32_t gpu_get_sm_idx(){
+  uint32_t smid;
+  asm volatile("mov.u32 %0, %%smid;" : "=r"(smid));
+  return(smid);
+}
+
+__device__ __forceinline__ void uint32_to_uchars(const uint32_t s, int *u1, int *u2, int *u3, int *u4) {
+  //*u1 = s & 0xff;
+  *u1 = __byte_perm(s, 0, 0x4440);
+  //*u2 = (s>>8) & 0xff;
+  *u2 = __byte_perm(s, 0, 0x4441);
+  //*u3 = (s>>16) & 0xff;
+  *u3 = __byte_perm(s, 0, 0x4442);
+  //*u4 = s>>24;
+  *u4 = __byte_perm(s, 0, 0x4443);
+}
+
+__device__ __forceinline__ uint32_t uchars_to_uint32(int u1, int u2, int u3, int u4) {
+  //return u1 | (u2<<8) | (u3<<16) | (u4<<24);
+  //return __byte_perm(u1, u2, 0x7740) + __byte_perm(u3, u4, 0x4077);
+  return u1 | (u2<<8) | __byte_perm(u3, u4, 0x4077);
+}
+
+__device__ __forceinline__ uint32_t uchar_to_uint32(int u1) {
+  return __byte_perm(u1, u1, 0x0);
+}
+
+__device__ __forceinline__ unsigned int vcmpgeu4(unsigned int a, unsigned int b) {
+    unsigned int r, c;
+    c = a-b;
+    asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c));// build mask from msbs
+    return r;           // byte-wise unsigned gt-eq comparison with mask result
+}
+
+__device__ __forceinline__ unsigned int vminu4(unsigned int a, unsigned int b) {
+    unsigned int r, s;
+    s = vcmpgeu4 (b, a);// mask = 0xff if a >= b
+    r = a & s;          // select a when b >= a
+    s = b & ~s;         // select b when b < a
+    r = r | s;          // combine byte selections
+    return r;
+}
+
+__device__ __forceinline__ void print_uchars(const char* str, const uint32_t s) {
+  int u1, u2, u3, u4;
+  uint32_to_uchars(s, &u1, &u2, &u3, &u4);
+  printf("%s: %d %d %d %d\n", str, u1, u2, u3, u4);
+}
+
+template<class T>
+__device__ __forceinline__ int popcount(T n) {
+#if CSCT or CSCT_RECOMPUTE
+  return __popc(n);
+#else
+  return __popcll(n);
+#endif
+}
+
+__inline__ __device__ uint8_t minu8_index4(int *min_idx, const uint8_t val1, const int dis, const uint8_t val2, const int dis2, const uint8_t val3, const int dis3, const uint8_t val4, const int dis4) {
+  int min_idx1 = dis;
+  uint8_t min1 = val1;
+  if(val1 > val2) {
+    min1 = val2;
+    min_idx1 = dis2;
+  }
+
+  int min_idx2 = dis3;
+  uint8_t min2 = val3;
+  if(val3 > val4) {
+    min2 = val4;
+    min_idx2 = dis4;
+  }
+
+  uint8_t minval = min1;
+  *min_idx = min_idx1;
+  if(min1 > min2) {
+    minval = min2;
+    *min_idx = min_idx2;
+  }
+  return minval;
+}
+
+__inline__ __device__ uint8_t minu8_index8(int *min_idx, const uint8_t val1, const int dis, const uint8_t val2, const int dis2, const uint8_t val3, const int dis3, const uint8_t val4, const int dis4, const uint8_t val5, const int dis5, const uint8_t val6, const int dis6, const uint8_t val7, const int dis7, const uint8_t val8, const int dis8) {
+  int min_idx1, min_idx2;
+  uint8_t minval1, minval2;
+
+  minval1 = minu8_index4(&min_idx1, val1, dis, val2, dis2, val3, dis3, val4, dis4);
+  minval2 = minu8_index4(&min_idx2, val5, dis5, val6, dis6, val7, dis7, val8, dis8);
+
+  *min_idx = min_idx1;
+  uint8_t minval = minval1;
+  if(minval1 > minval2) {
+    *min_idx = min_idx2;
+    minval = minval2;
+  }
+  return minval;
+}
+
+__inline__ __device__ int warpReduceMinIndex2(int *val, int idx) {
+  for(int d = 1; d < WARP_SIZE; d *= 2) {
+    int tmp = shfl_xor_32(*val, d);
+    int tmp_idx = shfl_xor_32(idx, d);
+    if(*val > tmp) {
+      *val = tmp;
+      idx = tmp_idx;
+    }
+  }
+  return idx;
+}
+
+__inline__ __device__ int warpReduceMinIndex(int val, int idx) {
+  for(int d = 1; d < WARP_SIZE; d *= 2) {
+    int tmp = shfl_xor_32(val, d);
+    int tmp_idx = shfl_xor_32(idx, d);
+    if(val > tmp) {
+      val = tmp;
+      idx = tmp_idx;
+    }
+  }
+  return idx;
+}
+
+__inline__ __device__ int warpReduceMin(int val) {
+  val = min(val, shfl_xor_32(val, 1));
+  val = min(val, shfl_xor_32(val, 2));
+  val = min(val, shfl_xor_32(val, 4));
+  val = min(val, shfl_xor_32(val, 8));
+  val = min(val, shfl_xor_32(val, 16));
+  return val;
+}
+
+__inline__ __device__ int blockReduceMin(int val) {
+  static __shared__ int shared[WARP_SIZE]; // Shared mem for WARP_SIZE partial sums
+  const int lane = threadIdx.x % WARP_SIZE;
+  const int wid = threadIdx.x / WARP_SIZE;
+
+  val = warpReduceMin(val);     // Each warp performs partial reduction
+
+  if (lane==0) shared[wid]=val; // Write reduced value to shared memory
+
+  __syncthreads();              // Wait for all partial reductions
+
+  //read from shared memory only if that warp existed
+  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : INT_MAX;
+
+  if (wid==0) val = warpReduceMin(val); //Final reduce within first warp
+
+  return val;
+}
+
+__inline__ __device__ int blockReduceMinIndex(int val, int idx) {
+  static __shared__ int shared_val[WARP_SIZE]; // Shared mem for WARP_SIZE partial mins
+  static __shared__ int shared_idx[WARP_SIZE]; // Shared mem for WARP_SIZE indexes
+  const int lane = threadIdx.x % WARP_SIZE;
+  const int wid = threadIdx.x / WARP_SIZE;
+
+  idx = warpReduceMinIndex2(&val, idx);     // Each warp performs partial reduction
+
+  if (lane==0) {
+    shared_val[wid]=val;
+    shared_idx[wid]=idx;
+  }
+
+  __syncthreads();              // Wait for all partial reductions
+
+  //read from shared memory only if that warp existed
+  val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared_val[lane] : INT_MAX;
+  idx = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared_idx[lane] : INT_MAX;
+
+  if (wid==0) {
+    idx = warpReduceMinIndex2(&val, idx); //Final reduce within first warp
+  }
+
+  return idx;
+}
+
+
+__inline__ __device__ bool blockAny(bool local_condition) {
+  __shared__ bool conditions[WARP_SIZE];
+  const int lane = threadIdx.x % WARP_SIZE;
+  const int wid = threadIdx.x / WARP_SIZE;
+
+  local_condition = __any_sync(0xffffffff, local_condition);     // Each warp performs __any
+
+  if (lane==0) {
+    conditions[wid]=local_condition;
+  }
+
+  __syncthreads();              // Wait for all partial __any
+
+  //read from shared memory only if that warp existed
+  local_condition = (threadIdx.x < blockDim.x / WARP_SIZE) ? conditions[lane] : false;
+
+  if (wid==0) {
+    local_condition = __any_sync(0xffffffff, local_condition); //Final __any within first warp
+  }
+
+  return local_condition;
+}
+
+} //namespace sgm_gpu
+
+#endif // SGM_GPU__UTIL_H_
+