misc,phvu

Doubt in Viterbi implementation

Hi,

I am confused wrt lines 26 and 27 of the viterbi implementation:
trellis[:, t] = (trellis[:, t-1, None].dot(self.Obs(obs[t]).T) * self.transProb).max(0) backpt[:, t] = (np.tile(trellis[:, t-1, None], [1, self.N]) * self.transProb).argmax(0)

I am not able to understand why there is no dot product happening in the second line. Why multiply only with transProb without taking obs into account?

Please help.

The program got stuck, when I expand datasets to distributed_tf_sample.py

Hi, Thanks you for give an sample example for distribute tensorflow program.But my program get stuck, when MNIST dataset is magnified 5-fold. And It is ok for dataset, which is smaller than MNIST magnified 5-fold.
the file distributed_tf_sample.py, which I used, as follows:

import math
import time

import tensorflow as tf
# from tensorflow.examples.tutorials.mnist import input_data
import input_data_nd
from mnist_nd import read_data_sets
import pdb

WORKER_TASKS = ['/job:worker/task:0', '/job:worker/task:1', '/job:worker/task:2']#, '/job:worker/task:1', '/job:worker/task:2'
PS_DEVICE = '/job:ps/task:0'
GRPC_SERVER = "grpc://localhost:2222"

HIDDEN_SIZE = 512
LAYERS = 2
ITERATIONS = 1000
EVAL_EVERY = 50
BATCH_SIZE = 64
L2_REGULARIZER = 1E-5


class Model(object):
    def __init__(self, variable_device, variable_scope, worker_device, worker_name_scope, reuse, optimizer):
        weights = []
        biases = []
        with tf.device(variable_device):
            with tf.variable_scope(variable_scope, reuse=reuse):
                for i in range(0, LAYERS + 1):
                    ws = [784 if i == 0 else HIDDEN_SIZE, 10 if i == LAYERS else HIDDEN_SIZE]

                    # Xavier initialization
                    weights.append(tf.get_variable("weight{}".format(i), ws,
                                                   initializer=tf.truncated_normal_initializer(
                                                       stddev=math.sqrt(3.0 / (ws[0] + ws[1])))))
                    biases.append(tf.get_variable("bias{}".format(i), ws[1:],
                                                  initializer=tf.constant_initializer(value=0, dtype=tf.float32)))
            # tf.get_variable_scope().reuse_variables()

        with tf.device(worker_device):
            with tf.name_scope(worker_name_scope):
                self.inp = tf.placeholder(tf.float32, shape=[None, 784])
                self.labels = tf.placeholder(tf.float32, shape=[None, 10])

                self.hiddens = []
                for i in range(0, LAYERS):
                    x = self.inp if i == 0 else self.hiddens[i - 1]
                    self.hiddens.append(tf.nn.relu(tf.matmul(x, weights[i]) + biases[i]))

                self.predicts = tf.nn.softmax(tf.matmul(self.hiddens[-1], weights[-1]) + biases[-1])

                regularizers = tf.add_n([tf.nn.l2_loss(x) for x in weights + biases])

                self.loss = (-tf.reduce_sum(self.labels * tf.log(tf.maximum(self.predicts, 1E-10))) +
                             L2_REGULARIZER * regularizers)

                self.correct_prediction = tf.equal(tf.argmax(self.predicts, 1), tf.argmax(self.labels, 1))
                self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))

                self.grad_op = optimizer.compute_gradients(self.loss)

        self.my_device = worker_device
        self.my_name_scope = worker_name_scope


with tf.Graph().as_default():
    batch_idx = tf.Variable(0, trainable=False)

    # learning_rate = tf.train.exponential_decay(0.01, batch_idx * BATCH_SIZE,
    #                                            decay_steps=1000, decay_rate=0.95, staircase=True)
    # opt = tf.train.MomentumOptimizer(learning_rate, 0.9)
    opt = tf.train.GradientDescentOptimizer(0.01)

    models = []

    for i, w in enumerate(WORKER_TASKS):
        models.append(Model(variable_device=PS_DEVICE,
                            variable_scope='mnist_variables',
                            worker_device=w, worker_name_scope='worker_name_scope_{}'.format(i),
                            reuse=i != 0,
                            optimizer=opt))

    # compute gradients of workers
    all_grads_and_vars = []
    all_losses = []
    for m in models:
        all_grads_and_vars.append(m.grad_op)
        all_losses.append(m.loss)

    # compute average gradient
    if len(all_grads_and_vars) > 1:
        average_grads = []
        for grads_and_vars in zip(*all_grads_and_vars):
            grads = []
            for g, _ in grads_and_vars:
                grads.append(tf.expand_dims(g, 0))

            grad = tf.reduce_mean(tf.concat(0, grads), 0)
            average_grads.append((grad, grads_and_vars[0][1]))
    else:
        average_grads = all_grads_and_vars[0]

    average_loss = tf.add_n(all_losses) / len(all_losses)

    # apply the average gradient
    apply_gradient_op = opt.apply_gradients(average_grads, global_step=batch_idx)

    init_op = tf.initialize_all_variables()

    #gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    #sess_config = tf.ConfigProto(log_device_placement=True, gpu_options=gpu_options)
    sess_config = tf.ConfigProto(log_device_placement=True)
    #with tf.Session(GRPC_SERVER, config=sess_config) as sess:
    with tf.Session(GRPC_SERVER, config=sess_config) as sess:
        print "Begin to build graph..."
        # pdb.set_trace()
        sess.run(init_op)
        print "Begin to load dataset"
        mnist = read_data_sets("MNIST_data/", one_hot=True)
        print "Finish load mnist datasets"
        training_time = time.time()
        for i in range(ITERATIONS):
            feeds = {}
            for m in models:
                # data-parallel
                batch_xs, batch_ys = mnist.train.next_batch(BATCH_SIZE)
                feeds[m.inp] = batch_xs
                feeds[m.labels] = batch_ys
                pdb.set_trace()
            print "Begin to train model in "+str(i)+' iterations...'
            _, avg_loss = sess.run([apply_gradient_op, average_loss], feed_dict=feeds)
            print "Finish to train " + str(i) + " iterations..."
            if i % EVAL_EVERY == 0:
                # we only need to evaluate on a single model, because there is only one set of parameters
                m = models[0]
                acc = sess.run(m.accuracy, feed_dict={m.inp: mnist.validation.images,
                                                      m.labels: mnist.validation.labels})
                print 'Step {}: validation accuracy = {}, average training loss = {}'.format(
                    tf.train.global_step(sess, batch_idx), acc, avg_loss)

        training_time = time.time() - training_time
        print 'Training time: {} seconds'.format(training_time)

        m = models[0]
        acc = sess.run(m.accuracy, feed_dict={m.inp: mnist.test.images, m.labels: mnist.test.labels})
        print 'Test accuracy: {}'.format(acc)

The commind Line as follows:
In ps machine:
import tensorflow as tf
cluster = tf.train.ClusterSpec({"ps": ["yilaguan_1:2222"],"worker":["yilaguan_1:2223", "yilaguan_2:2224"]})
server = tf.train.Server(cluster, job_name="ps", task_index=0)
server.join()
In worker 1:
import tensorflow as tf
cluster = tf.train.ClusterSpec({"ps": ["yilaguan_1:2222"],"worker":["yilaguan_1:2223", "yilaguan_2:2224"]})
server = tf.train.Server(cluster, job_name="worker", task_index=0)
In worker 2:
import tensorflow as tf
cluster = tf.train.ClusterSpec({"ps": ["yilaguan_1:2222"],"worker":["yilaguan_1:2223", "yilaguan_2:2224"]})
server = tf.train.Server(cluster, job_name="worker", task_index=1)
In worker 1:
python distributed_tf_sample.py

Modify file as follows:
In mnist file, I make some change:

def expand_data(wait_expand, data, mul_number):
  for i in range(mul_number):
    wait_expand = numpy.append(wait_expand, data, 0)
  return wait_expand



def read_data_sets(train_dir, fake_data=False, one_hot=False, dtype=tf.float32):
  class DataSets(object):
    pass
  data_sets = DataSets()

  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    data_sets.train = fake()
    data_sets.validation = fake()
    data_sets.test = fake()
    return data_sets

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
  VALIDATION_SIZE = 5000

  local_file = maybe_download(TRAIN_IMAGES, train_dir, SOURCE_URL + TRAIN_IMAGES)
  train_images = extract_images(local_file)

  local_file = maybe_download(TRAIN_LABELS, train_dir, SOURCE_URL + TRAIN_LABELS)
  train_labels = extract_labels(local_file, one_hot=one_hot)

  local_file = maybe_download(TEST_IMAGES, train_dir, SOURCE_URL + TEST_IMAGES)
  test_images = extract_images(local_file)

  local_file = maybe_download(TEST_LABELS, train_dir, SOURCE_URL + TEST_LABELS)
  test_labels = extract_labels(local_file, one_hot=one_hot)

  mul_number = 4

  validation_images = train_images[:VALIDATION_SIZE]
  validation_images = expand_data(validation_images, train_images[:VALIDATION_SIZE], mul_number)

  validation_labels = train_labels[:VALIDATION_SIZE]
  validation_labels = expand_data(validation_labels, train_labels[:VALIDATION_SIZE], mul_number)

  train_images = train_images[VALIDATION_SIZE:]
  train_images = expand_data(train_images, train_images[VALIDATION_SIZE:], mul_number)

  train_labels = train_labels[VALIDATION_SIZE:]
  train_labels = expand_data(train_labels, train_labels[VALIDATION_SIZE:], mul_number)

  data_sets.train = DataSet(train_images, train_labels, dtype=dtype)
  data_sets.validation = DataSet(validation_images, validation_labels,
                                 dtype=dtype)
  data_sets.test = DataSet(test_images, test_labels, dtype=dtype)
  return data_sets

The ERROR I getted as below:

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Finish load mnist datasets
Begin to train model in 0 iterations...

But the program get stuck here.And I try to find the log in worker, I get the message as follow:

[libprotobuf WARNING google/protobuf/src/google/protobuf/io/coded_stream.cc:569] Reading dangerously large protocol message.  If the message turns out to be larger than 67108864 bytes, parsing will be halted for security reasons.  To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
[libprotobuf ERROR google/protobuf/src/google/protobuf/io/coded_stream.cc:207] A protocol message was rejected because it was too big (more than 67108864 bytes).  To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
[libprotobuf WARNING google/protobuf/src/google/protobuf/io/coded_stream.cc:81] The total number of bytes read was 67108864
E tensorflow/core/framework/tensor.cc:105] Input size was 67108839 and expected 78400000

Could you tell me why will cause this problem?

phvu / misc Goto Github PK

misc's People

Contributors

Stargazers

Watchers

Forkers

misc's Issues

Doubt in Viterbi implementation

The program got stuck, when I expand datasets to distributed_tf_sample.py

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent