We’re having a lot of trouble hacking together a reinforcement learning version of this, so we are taking an alternative approach, inspired by watching the MIT underactuated robotics course.

http://underactuated.csail.mit.edu/underactuated.html?chapter=acrobot

It took some pen and paper to get the equations of motion (which are maybe right?).
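
For reference, with the pole treated as a uniform rod of half-length $l$, $\theta$ measured from the upright position, and $F$ the horizontal force on the cart (this is the convention the Gym dynamics in the environment code below integrate), the equations of motion are

$$(m_c + m_p)\ddot{x} + m_p l \ddot{\theta}\cos\theta - m_p l \dot{\theta}^2 \sin\theta = F$$

$$\frac{4}{3} m_p l^2 \ddot{\theta} + m_p l \ddot{x}\cos\theta - m_p g l \sin\theta = 0$$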

OpenAI Gym has a CartPole environment. We use a modified copy of it (included at the bottom of this post) that allows full swing-ups and returns delayed observations.

We switch over to LQR once the pole is above a certain height (cos θ > 0.6 in the code below) and its energy is close to the upright value.

https://en.wikipedia.org/wiki/Linear%E2%80%93quadratic_regulator
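
For a linear system $\dot{x} = Ax + Bu$, LQR finds the state feedback $u = -Kx$ that minimizes the quadratic cost

$$J = \int_0^\infty \left( x^T Q x + u^T R u \right) dt$$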

This SciPy function solves the algebraic Riccati equation for the continuous-time, infinite-horizon case:

https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.linalg.solve_continuous_are.html
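
It returns the matrix $P$ satisfying

$$A^T P + P A - P B R^{-1} B^T P + Q = 0$$

and the feedback gain is then $K = R^{-1} B^T P$, which is what the script below computes.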

Things that helped: trying to balance the pole from the upright position first, then attempting the swing-up from the hanging-down position.

Tuning the LQR weights on theta and thetadot. When the thetadot weight was too small, the balance was unstable.

Hacked in the LQR control magnitude by adjusting the environment's force_mag variable every step. Nasty.

Put in some slight compensation for the delayed observation, which reflects our actual sensor system.
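
Concretely, the environment hands back the state from buffer_size steps ago, so before computing the control we dead-reckon the stale observation forward by buffer_size · tau seconds with a crude Euler step (the same three lines appear inline in the control loop below). Pulled out as a hypothetical helper just for clarity:

import numpy as np

def predict_forward(obs, delay_steps, tau=0.02, gravity=9.8, length=0.5):
    # Crude Euler prediction of a delayed [x, xdot, theta, thetadot]
    # observation forward by delay_steps * tau seconds.
    dt = delay_steps * tau
    x, xdot, theta, thetadot = obs
    x = x + xdot * dt              # advance cart position with the stale velocity
    theta = theta + thetadot * dt  # advance pole angle with the stale rate
    thetadot = thetadot + np.sin(theta) * gravity / (2 * length) / 2 * dt  # rough gravity term
    return np.array([x, xdot, theta, thetadot])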

from custom_cartpole_delay import CartPoleEnv
import gym
import numpy as np
import scipy.linalg as linalg

lqr = linalg.solve_continuous_are

env = CartPoleEnv()
env.buffer_size = 5  # observations arrive 5 timesteps late

# Physical parameters, matching the environment
gravity = 9.8
masscart = 1.0
masspole = 0.1
total_mass = masspole + masscart
length = 0.5  # actually half the pole's length
polemass_length = masspole * length
force_mag = 10.0
tau = 0.02  # seconds between state updates

def E(x):
    # Pole energy: rotational kinetic energy about the pivot
    # (uniform rod, I = masspole * (2*length)**2 / 3) plus potential energy.
    return 0.5 * masspole * (2 * length)**2 / 3 * x[3]**2 + np.cos(x[2]) * polemass_length * gravity

def u(x):
    # Energy-shaping swing-up: push so that the pole energy E is driven
    # toward the upright energy Ed (defined below).
    return 1.0 * (E(x) - Ed) * x[3] * np.cos(x[2])


# Linearization used for LQR: H is the constant mass matrix of the
# linearized dynamics; A and B below are the state-space matrices
# after multiplying through by H^-1.
H = np.array([
    [1, 0, 0, 0],
    [0, total_mass, 0, -polemass_length],
    [0, 0, 1, 0],
    [0, -polemass_length, 0, (2 * length)**2 * masspole / 3]
])

Hinv = np.linalg.inv(H)

A = Hinv @ np.array([
    [0, 1, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 1],
    [0, 0, -polemass_length * gravity, 0]
])
B = Hinv @ np.array([0, 1.0, 0, 0]).reshape((4, 1))

# LQR cost weights: theta is penalized most heavily, then thetadot.
Q = np.diag([0.1, 1.0, 100.0, 5.0])
R = np.array([[0.1]])

# Solve the continuous-time algebraic Riccati equation and form the gain.
P = lqr(A, B, Q, R)
Rinv = np.linalg.inv(R)
K = Rinv @ B.T @ P
print(K)

def ulqr(x):
    # LQR feedback near the top; replacing theta with sin(theta) means
    # angles near any multiple of 2*pi also read as "near upright".
    x1 = np.copy(x)
    x1[2] = np.sin(x1[2])
    return np.dot(K, x1)

Ed = E([0, 0, 0, 0])  # energy of the upright equilibrium, the swing-up target

observation = env.reset()
for _ in range(1000):
    observation = np.copy(observation)
    env.render()
    # Compensate for the sensor delay: dead-reckon the stale observation
    # forward by buffer_size timesteps with a crude Euler prediction.
    observation[0] += observation[1] * tau * env.buffer_size
    observation[2] += observation[3] * tau * env.buffer_size
    observation[3] += np.sin(observation[2]) * tau * env.buffer_size * gravity / (length * 2) / 2

    # Energy-shaping swing-up plus a weak pull back toward the track center.
    a = u(observation) - 0.3 * observation[0] - 0.1 * observation[1]
    if abs(E(observation) - Ed) < 0.1 and np.cos(observation[2]) > 0.6:
        # Near the upright, hand over to LQR. The environment only takes
        # left/right bangs, so hack its force magnitude to match |a|.
        a = ulqr(observation)
        env.force_mag = min(abs(a[0]), 10)
    else:
        env.force_mag = 10.0
    action = 0 if a < 0 else 1

    observation, reward, done, info = env.step(action)
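
Here is the modified environment, custom_cartpole_delay.py. It is a copy of Gym's CartPoleEnv that allows full swings, starts the pole hanging nearly straight down, uses a swing-up style reward, and returns observations through a delay buffer.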







import math
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
import logging
logger = logging.getLogger(__name__)
 
class CartPoleEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second' : 50
    }
 
    def __init__(self):
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = (self.masspole + self.masscart)
        self.length = 0.5 # actually half the pole's length
        self.polemass_length = (self.masspole * self.length)
        self.force_mag = 3.0
        self.tau = 0.02  # seconds between state updates
 
        # Angle at which to fail the episode
        # we expect full swings
        self.theta_threshold_radians =  np.pi  #12 * 2 * math.pi / 360
        self.x_threshold = 2.4
 
        # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds
        high = np.array([
            self.x_threshold * 2,
            np.finfo(np.float32).max,
            self.theta_threshold_radians * 2,
            np.finfo(np.float32).max])
        high2 = np.array([1])
        self.action_space = spaces.Discrete(2) #spaces.Box(-high2, high2)# 
        self.observation_space = spaces.Box(-high, high)
 
        self._seed()
        self.viewer = None
        self.state = None
        self.buffer_size = 0

 
        self.steps_beyond_done = None
        self.steps = 0
        self.num_envs = 1
        self.viewer = None
    viewer = None
    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
 
    def _step(self, action):
        #assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
        #print(action)
        #action =action[0] # max(-1, min(action[0],1))
        state = self.state
        x, x_dot, theta, theta_dot = state
        force = self.force_mag if action==1 else -self.force_mag
        #force = self.force_mag * action
        #print(action)
        #print(state)
 
        # Integrate: positions are updated with the *old* velocities first,
        # then the accelerations are evaluated at the new positions.
        x = x + self.tau * x_dot
        theta = theta + self.tau * theta_dot
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        # Standard Gym cart-pole dynamics (pole treated as a uniform rod).
        temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass))
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass

        x_dot = x_dot + self.tau * xacc
        theta_dot = theta_dot + self.tau * thetaacc
        self.state = (x, x_dot, theta, theta_dot)
        
        # Episode ends if the cart leaves the track, the pole winds up past
        # 5*pi in either direction, or after 1024 steps.
        done = x < -self.x_threshold \
                or x > self.x_threshold \
                or theta < -np.pi * 5 \
                or theta > np.pi * 5 \
                or self.steps > 1024
        done = bool(done)
        
        self.steps += 1
        limit = 200
 
        # Swing-up reward: penalize leaving the track, reward being upright
        # (cos(theta) near 1) and staying near the center of the track.
        reward = 0.0
        if x < -self.x_threshold or x > self.x_threshold:
            reward -= 1.0
        reward += (np.cos(theta) + 1)**2 / 4
        #reward -= 0.1 * action**2
        reward += -0.1 * x**2
        if np.cos(theta) > 0.95:
            reward += 1
        #reward = reward/2048
        '''
        if not done:
            reward = 1.0
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            if self.steps_beyond_done == 0:
                logger.warning("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.")
            self.steps_beyond_done += 1
            reward = 0.0
        '''
        #print(np.array(self.state).reshape((1,-1)), reward, done, {})

        #obs = np.array([x,x_dot,np.cos(theta),np.sin(theta),theta_dot])# + self.action_buffer)
        # Simulate sensor latency: return the state from buffer_size steps ago.
        self.buffer.append(np.copy(self.state))
        obs2 = self.buffer.pop(0)
        return obs2, reward, done, {}
 
    def _reset(self):
        #self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
        # Start with the pole nearly hanging straight down: [x, xdot, theta, thetadot]
        self.state = np.array([0, 0, np.pi + 0.2, 0])  # np.pi+0.2 + self.np_random.uniform(low=-1.0, high=1.0, size=(4,))
        self.steps_beyond_done = None
        self.steps = 0
        # Pre-fill the delay buffer so the first buffer_size observations are the initial state.
        self.buffer = []
        for i in range(self.buffer_size):
            self.buffer.append(self.state)
        return np.array(self.state)
 
    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
 
        screen_width = 600
        screen_height = 400
 
        world_width = self.x_threshold*2
        scale = screen_width/world_width
        carty = 100 # TOP OF CART
        polewidth = 10.0
        polelen = scale * 1.0
        cartwidth = 50.0
        cartheight = 30.0
 
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            l,r,t,b = -cartwidth/2, cartwidth/2, cartheight/2, -cartheight/2
            axleoffset =cartheight/4.0
            cart = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)])
            self.carttrans = rendering.Transform()
            cart.add_attr(self.carttrans)
            self.viewer.add_geom(cart)
            l,r,t,b = -polewidth/2,polewidth/2,polelen-polewidth/2,-polewidth/2
            pole = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)])
            pole.set_color(.8,.6,.4)
            self.poletrans = rendering.Transform(translation=(0, axleoffset))
            pole.add_attr(self.poletrans)
            pole.add_attr(self.carttrans)
            self.viewer.add_geom(pole)
            self.axle = rendering.make_circle(polewidth/2)
            self.axle.add_attr(self.poletrans)
            self.axle.add_attr(self.carttrans)
            self.axle.set_color(.5,.5,.8)
            self.viewer.add_geom(self.axle)
            self.track = rendering.Line((0,carty), (screen_width,carty))
            self.track.set_color(0,0,0)
            self.viewer.add_geom(self.track)
 
        if self.state is None: return None
 
        x = self.state
        cartx = x[0]*scale+screen_width/2.0 # MIDDLE OF CART
        self.carttrans.set_translation(cartx, carty)
        self.poletrans.set_rotation(-x[2])
 
        return self.viewer.render(return_rgb_array = mode=='rgb_array')
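
As a quick sanity check of the delay buffer (a minimal sketch; it calls _reset and _step directly so it doesn't depend on how a particular Gym version wraps them), the observation returned at each step should be the state from buffer_size steps earlier:

import numpy as np
from custom_cartpole_delay import CartPoleEnv

env = CartPoleEnv()
env.buffer_size = 5
obs = env._reset()

states = [np.copy(env.state)]
for t in range(20):
    obs, reward, done, info = env._step(1)  # constant "push right" action
    states.append(np.copy(env.state))
    # The observation lags the true state by buffer_size timesteps.
    assert np.allclose(obs, states[max(0, t + 1 - env.buffer_size)])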