Whereas HMMs assume that sequence elements are generated from hidden variables organized in a flat linear structure, PCFGs assume that the hidden variables can be organized hierarchically. This allows for richer representations. For instance, an HMM would have a hard time with the sentence:

Joan saw the big old scary man with binoculars.

Here, there are two possible readings (either Joan has the binoculars or the man does) but an HMM would have a hard time capturing this because the information involved in this inference is so far apart; “Joan” is at the beginning of the sentence and “with binoculars” is at the end.

Sampling from a PCFG

// extract words from a single string
// assumes that whitespace is the delimiter
var w = function(string) {
  return string.split(/ +/);
}

var grammar = {
  "S":  map(w, ["x OP S", "S OP x", "N"]),
  "OP": map(w, ["+", "-", "*"]),
  "N":  map(w, ["0", "1", "2", "3", "4", "5", "6"])
};

var isTerminal = function(symbol) {
  return !(_.contains(["S","OP","N"], symbol))
}

var unfold = function(symbol) {
  if (isTerminal(symbol)) {
    return symbol
  }
  
  var rules = grammar[symbol];
  var sampledRule = rules[ randomInteger(rules.length) ];
  
  return [symbol].concat( map(unfold, sampledRule) );
}

var reformatTree = function(x) {
  if (Array.isArray(x)) {
    return "[" + map(reformatTree, x).join(" ") + "]"
  }
  return x
}

repeat(5,
       function() { return reformatTree(unfold("S")) })

(Plug in the output of reformatTree to Miles Shang’s syntax tree generator for visualization)

Application: parsing ambiguous sentences

/*

 S -> NP VP
 NP -> N | D N | D N PP
 VP -> V NP | V' PP
 V' -> V NP
 PP -> P NP
 
 */

var w = function(string) {
  return string.split(/ +/);
}

var grammar = {
  "S":  map(w, ["NP VP"]),
  "NP": map(w, ["N", "D N", "D N PP"]),
  "VP": map(w, ["V NP", "Vb PP"]),
  "Vb": map(w, ["V NP"]),
  "PP": map(w, ["P N"]),
  "P":  map(w, ["with"]),
  "N":  map(w, ["Joan", "man", "binoculars"]),
  "V":  map(w, ["saw"]),
  "D":  map(w, ["the"]) 
};

// get the leaves of a parse tree
var getLeaves = function(x) {
  if (x.length > 1) {
    return reduce(function(x, acc) { return getLeaves(x).concat(acc)},
                  [],
                  x.slice(1));
  }
  
  return [ x[0] ]
}

var reformatTree = function(x) {
  if (Array.isArray(x)) {
    return "[" + map(reformatTree, x).join(" ") + "]"
  }
  return x
}

var preTerminals = ["P", "N", "V", "D"];
var terminals = _.flatten(map(function(x) { return grammar[x]},
                              ["P", "N", "V", "D"]
                             ));

// is a symbol a terminal?
var isTerminal = function(symbol) {
  return _.contains(terminals, symbol);
}

// is a symbol a pre-terminal?
// i.e., does it only rewrite to terminals?
var isPreTerminal = function(symbol) {
  return _.contains(["P","N","V","D"], symbol);
}

// rewrite a preterminal to a terminal
var samplePreTerminal = function(symbol) {
  var rules = grammar[symbol];
  return rules[ randomInteger(rules.length) ];
}

var _unfold = function(symbol) {
  var yieldLeft = globalStore.yieldLeft;
  if (yieldLeft.length == 0) {
    factor(-Infinity);
  }
  if (isPreTerminal(symbol)) {
    var terminal = samplePreTerminal(symbol);
    factor(terminal == yieldLeft[0] ? 0 : -Infinity);
    globalStore.yieldLeft =  yieldLeft.slice(1);
    return [symbol, terminal];
  }
  var rules = grammar[symbol];
  var sampledRewrite = rules[ randomInteger(rules.length) ];
  return [symbol].concat(map(_unfold, sampledRewrite));

};

var unfold = function() {
  return _.flatten(_unfold("S")).join(" ");
}

var erp = ParticleFilter(function() {
  globalStore.yieldLeft = "Joan saw the man with binoculars".split(" ");
  var x = _unfold("S");
  factor(globalStore.yieldLeft.length == 0 ? 0 : -Infinity);
  return x;
}, 1000);

reformatTree( sample(erp) )

(Plug in the output of reformatTree to Miles Shang’s syntax tree generator for visualization)