Historical stock prices analysis
Today I’ve played with historical stock prices. There are some models of prices movement that describe it as brownian motion.(Actually prices follow log-normal behavior).
My plan was following:
1. Get historical data.(from 1980 to modern times)
2. Get distribution of log(S_i/S_i+1)
3. Fit it to gaussian distribution.
I got historical data from Yahoo using next script:
#!/usr/bin/ruby
require 'rubygems'
require 'net/http'
require 'csv'
require 'date'
require 'optparse'
require 'ostruct'
response = nil
startDate = Date.new(1980,1,1)
endDate = Date.new(2012,1,1)
def getInfo(symbols, startDate, endDate)
proxy = ENV['http_proxy'] ? URI.parse( ENV['http_proxy'] ) : OpenStruct.new
Net::HTTP::Proxy( proxy.host, proxy.port, proxy.user, proxy.password ).start( "itable.finance.yahoo.com", 80 ) do |http|
begin
http.read_timeout = 30
query = "/table.csv?s=#{symbols}&g=d" +
"&a=#{startDate.month-1}&b=#{startDate.mday}&c=#{startDate.year}" +
"&d=#{endDate.month-1}&e=#{endDate.mday}&f=#{endDate.year.to_s}"
response = http.get( query )
body = response.body.chomp
# If we don't get the first line like this, there was something
# wrong with the data (404 error, new data formet, etc).
return 0 if body !~ /Date,Open,High,Low,Close,Volume,Adj Close/
# Parse into an array of arrays.
rows = CSV.parse( body , :col_sep => ",")
# Remove the first array since it is just the field headers.
rows.shift
File.open("data/#{symbols}","w+") do |file|
rows.each{|line|
file.puts line.to_s
}
end
puts rows.length
return 0
rescue Timeout::Error
puts "Timeout error for quote #{symbols}"
return -1
end
end
end
def generate_quotes()
quotes = File::readlines("quotes.csv")
quotes
end
quotes = generate_quotes()
num = quotes.size
threads = []
num.times{|i|
quote = quotes[i].chomp.downcase
threads << Thread.new(quote){
puts "#{i} #{quote}"
while (getInfo(quote, startDate, endDate)!=0) do
end
}
}
threads.each{ |aThread| aThread.join}
Here we make simple http-object with with query for quotes specified in file quotes.csv. Results saved in
“data” directory with files named “quote”.
Second I wrote another ruby-script to analyze that data:
#!/usr/bin/ruby
require 'rubygems'
require 'gsl'
MAX = 1.0
Dir::chdir("data")
files = Dir::glob("*")
num = files.size
def get_histogram file_name
h = GSL::Histogram::alloc(120, [-MAX, MAX])
File.open(file_name, "r") do |f1|
s = f1.readlines
n = s.size - 1
n.times{|index|
i = index + 1
a = (s[i-1]).split(",")[6].to_f
b = (s[i]).split(",")[6].to_f
next if (a==0) || (b==0)
dif = Math::log(b / a)
wgt = s[i].split(",")[5].to_f
h.fill(dif, wgt)
}
end
h = h.normalize
return h
end
files.each{|file_name|
h = get_histogram file_name
sigma, mean, height, errsig, errmean, errhei, sumsq, dof = h.fit_gaussian
x = GSL::Vector.linspace(-MAX, MAX, 120)
y = height*GSL::Ran::gaussian_pdf(x-mean, sigma)
z = height*GSL::Ran::gaussian_pdf(x-h.mean, h.sigma)
puts "#{file_name}, #{h.mean}, #{h.sigma}, #{mean}, #{sigma}, #{errsig}, #{errmean}, #{errhei}, #{sumsq}, #{dof}, #{h.max_val}, #{h.min_val}"
GSL::graph(h, [x, y], [x,z], "-T png -L #{file_name.upcase} -x -0.5 0.5 -C -g 3 - > #{file_name}.png")
h.fprintf("#{file_name}.hist", "%4.4f", "%4.4f")
}
For each file in data-directory we find log of ratio of two adj. close prices and put it in histogram object weighted by volume. After that we fit this histogram to gaussian function(using least squares and with mean/sigma from distribution), plot this functions to png-images(red line is real-data, green-line -l.s.f, blue – m/s from distribution) and output fitting data to standard output.
There are a lot of data so I’ve ordered it by several parameters. First errors, minimum were for companies aac, mwnaf and others. Typical distribution for them is following:
So this comes from low variation of prices. Maximum errors were for companies, like ICOP, UOMO, ORS and so on. ICOP distribution looks like this:

Here fitting errors come from high irregularities of data and it looks not gaussian-like.
Actually some companies fit good:

Also I’ve examined most of these pictures(>3000 files). Most of data can be divided in two parts: that fitted good and usually have not so big sigma and others have high pick and heavy tails/several humps:

So in future I’d like to find some functions that better fit all data.
Playing with clusters
Today I want to talk about coagulation and Monte-Carlo method to model such process. Suppose we have particles and every such particle(or cluster) consists from elementary atoms of one kind. So
atoms gluing together give us a
-cluster. From time to time two clusters can collide, stick together and form a bigger cluster:
Probability of such collision in time interval depends only from size of clusters
and
and equals
. Such processes are described by Smoluchowski coagulation equation:
Here we have ‘balance equation’: -clusters are made from collision of
- and
-clusters, but burn after collision with any other cluster(
comes from counting clusters twice, e.g. 1+4 and 4+1).
There are different ways to follow evolution of such systems, here I describe a Monte-Carlo way. Lets call particle a pair of two numbers , where
is number of real
-clusters, that represents our particle
(think of
like president of countries or sport team captain). Now we have to define how our particles would interact with each other. With probability
particles
and
will collide in time
. But collision is a pair process, so only
clusters from each particle would interact(suppose
):
It can be easily proven, that our model of particles corresponds Smoluchowski equation. Total number of
-cluster is:
[...] – is Iverson bracket. Average -clusters gain on step
comes from collision of
and
-clusters:
As second term goes to zero and we get first part of Smoluchowski equation. Same thing with decrease of
-clusters.
In order to check our model lets write simple Ocaml program and do the calculation.
open Printf;;
type particle = {mutable index:int; mutable weight: float};;
let kernel x y = x + y;;
let realizations = 1000;;
let rec create_cell num_particles ind conc =
Array.init num_particles (fun i -> {index = ind; weight = conc /. (float num_particles)});;
let rec create_process num_cells num_particles ind conc =
Array.init num_cells (fun i -> create_cell num_particles ind conc);;
let iter_each_particle process f =
Array.iter (fun cell -> Array.iter (f) cell) process;;
let get_conc cam =
let x = ref 0.0 in
iter_each_particle cam (fun p -> x := !x +.p.weight);
!x/.(float realizations);;
let mixture cam =
let krp = Array.length cam in
iter_each_particle cam (fun p ->
let r_krp = Random.int krp in
let r_nm = Random.int (Array.length cam.(r_krp)) in
let r_p = (cam.(r_krp)).(r_nm) in
let tmp_ind = r_p.index in
let tmp_wt = r_p.weight in
r_p.index <- p.index;
r_p.weight <- p.weight;
p.index <- tmp_ind;
p.weight <- tmp_wt;
);;
let rec iter_over_pairs func cell =
let n = Array.length cell in
for i = 0 to (n-2) do
for j = (i+1) to (n-1) do
func cell.(i) cell.(j)
done;
done;;
let coll dt a b =
if ((Random.float 1.0) < dt *. (max a.weight b.weight) *. 2.0 *. (float (kernel a.index b.index)))
then if (a.weight < b.weight)
then (b.weight <- b.weight -. a.weight; a.index <- a.index + b.index)
else if (b.weight < a.weight)
then (a.weight <- a.weight -. b.weight; b.index <- b.index + a.index)
else (let wt = a.weight *. 0.5 in
let ind = a.index + b.index in
b.weight <- wt; a.weight <- wt;
a.index <- ind; b.index <- ind);;
let rec run_exp total_time curr_step out_step total_steps cam =
if curr_step <= total_steps
then (
let dt = total_time /. (float total_steps) in
if (curr_step mod out_step = 0)
then (printf "%f; %f;\n" ((float_of_int curr_step) *. dt) (get_conc cam));
Array.iter (fun cell -> iter_over_pairs (coll dt) cell) cam;
mixture cam;
run_exp total_time (curr_step + 1) out_step total_steps cam;
);;
let _ =
let cam = create_process realizations 2 1 1.0 in
let num_steps = 1000 in
let dt = 3.0 in
run_exp dt 0 2 num_steps cam;;
This program prints total number of clusters with additive kernel: , timestep 3/1000 and monodisperse initial conditions. Analytical solution is 1/exp(t), and we have a good match:
Some useful articles are:
