Finite State Machine to segment Thai syllables

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
3
down vote

favorite












I am developing an FSM to segment Thai syllables with the following rules:



enter image description here



Here are my util functions:



package edu.washington.rippeth.ling473

package object proj3 {

  /** Appends `char` to `sb` when a character is present.
    *
    * The builder is mutated in place; nothing is copied.
    *
    * @param sb the builder to extend
    * @param char the character to append; `None` leaves `sb` untouched
    * @return the same builder instance that was passed in
    */
  private[proj3] def optionallyAppend(sb: StringBuilder, char: Option[Char]): StringBuilder = {
    char.foreach(c => sb.append(c))
    sb
  }

  /** Inserts a space directly in front of the builder's final character.
    *
    * NOTE: for an empty builder the computed index is -1, which the underlying
    * insert does not accept.
    *
    * @param sb the builder to modify in place
    * @return the same builder, with a space separating its last character from the rest
    */
  private[proj3] def addSpaceBeforeLast(sb: StringBuilder): StringBuilder =
    sb.insert(sb.length - 1, ' ')

  /** Appends a single space to the builder.
    *
    * @param sb the builder to modify in place
    * @return the same builder, now ending in a space
    */
  private[proj3] def addSpaceAtEnd(sb: StringBuilder): StringBuilder =
    sb += ' '
}





Here are my states:



package edu.washington.rippeth.ling473.proj3

private[proj3] sealed trait State {

  /** The string builder before this state's character is potentially applied. */
  protected def stringBuilderIn: StringBuilder

  /** The character which triggered the transition into this state.
    *
    * It is an `Option[Char]` because Scala has no empty char literal; the
    * initial state carries `None`.
    */
  protected def char: Option[Char]

  /** An optional function applied to the builder after the character has been
    * appended. This is `None` in the normal case.
    */
  protected def postProcessesStringBuilder: Option[StringBuilder => StringBuilder]

  /** The builder after this state's character has been appended and the
    * post-processing function (if any) has been applied.
    *
    * The builder is mutated in place, so this is a `lazy val`: the append and
    * post-processing happen exactly once per state, on first access. Reading
    * the value again (for logging, say) no longer appends the character or the
    * space a second time.
    *
    * @return the same builder instance as `stringBuilderIn`, updated
    */
  lazy val stringBuilderOut: StringBuilder = {
    val appended = optionallyAppend(stringBuilderIn, char)
    postProcessesStringBuilder.fold(appended)(f => f(appended))
  }
}



private[proj3] sealed trait NormalState extends State {

  /** Normal states leave the builder alone once the character has been
    * appended, so no post-processing function is supplied.
    *
    * @return always `None`
    */
  protected final def postProcessesStringBuilder: Option[StringBuilder => StringBuilder] =
    Option.empty
}


private[proj3] sealed trait StateWithPenultimateSpace extends State {

  /** States 7 and 8 post-process the builder with `addSpaceBeforeLast`, which
    * puts a space in front of the last character.
    *
    * @return the post-processing function, always defined
    */
  protected final def postProcessesStringBuilder: Option[StringBuilder => StringBuilder] =
    Some((sb: StringBuilder) => addSpaceBeforeLast(sb))
}


private[proj3] sealed trait StateWithFinalSpace extends State {

  /** State 9 post-processes the builder with `addSpaceAtEnd`, which appends a
    * space after the character that was just added.
    *
    * @return the post-processing function, always defined
    */
  protected final def postProcessesStringBuilder: Option[StringBuilder => StringBuilder] =
    Some((sb: StringBuilder) => addSpaceAtEnd(sb))
}


// State0 through State6 are the "normal" states: each one carries the incoming
// builder and the character that triggered the transition, and its output is
// simply that builder with the character (if any) appended.
private[proj3] final case class State0(stringBuilderIn: StringBuilder, char: Option[Char])
extends NormalState
private[proj3] final case class State1(stringBuilderIn: StringBuilder, char: Option[Char])
extends NormalState
private[proj3] final case class State2(stringBuilderIn: StringBuilder, char: Option[Char])
extends NormalState
private[proj3] final case class State3(stringBuilderIn: StringBuilder, char: Option[Char])
extends NormalState
private[proj3] final case class State4(stringBuilderIn: StringBuilder, char: Option[Char])
extends NormalState
private[proj3] final case class State5(stringBuilderIn: StringBuilder, char: Option[Char])
extends NormalState
private[proj3] final case class State6(stringBuilderIn: StringBuilder, char: Option[Char])
extends NormalState

// State7 and State8 are special: after appending the character they insert a
// space in front of it, i.e. before the last character of the output string.
private[proj3] final case class State7(stringBuilderIn: StringBuilder, char: Option[Char])
extends StateWithPenultimateSpace
private[proj3] final case class State8(stringBuilderIn: StringBuilder, char: Option[Char])
extends StateWithPenultimateSpace

// State9 is also special: after appending the character it adds a space to the
// end of the output string.
private[proj3] final case class State9(stringBuilderIn: StringBuilder, char: Option[Char])
extends StateWithFinalSpace


And here is my state machine:



package edu.washington.rippeth.ling473.proj3

import com.typesafe.scalalogging.LazyLogging

/** Given input lines, moves through the state machine for each line.
  *
  * Every character of a line drives exactly one transition. The states
  * accumulate the characters, and the separating spaces, in one shared
  * `StringBuilder` per line.
  *
  * @param lines the unsegmented Thai lines
  */
class StateMachine(lines: Iterable[String]) extends LazyLogging {

  // The categories that force transitions
  private final val V1: Set[Char] = "เแโใไ".toSet
  private final val C1: Set[Char] = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮ".toSet
  private final val C2: Set[Char] = "รลวนม".toSet
  // Combining vowel marks, written as escapes. Typing them on a dotted-circle
  // placeholder (U+25CC) would put the placeholder itself into the set.
  private final val V2: Set[Char] = "\u0E34\u0E35\u0E36\u0E37\u0E38\u0E39\u0E31\u0E47".toSet
  // The four tone marks
  private final val T: Set[Char] = Set('\u0E48', '\u0E49', '\u0E4A', '\u0E4B')
  private final val V3: Set[Char] = "าอยว".toSet
  private final val C3: Set[Char] = "งนมดบกยว".toSet

  /** Rejects a character for which the current state has no transition.
    *
    * @throws IllegalArgumentException always
    */
  private def invalid(c: Char): Nothing =
    throw new IllegalArgumentException(s"Invalid input '$c'")

  // The following methods define the actions to be performed
  // (i.e., defining to which state the machine should transition).
  // The naming convention is actionN where N is the state before
  // transition. The order of the checks matters: the categories overlap,
  // so the first matching category wins.

  private def action0(oldState: State, c: Char): State =
    if (V1.contains(c)) {
      logger.trace("Going to state 1")
      State1(oldState.stringBuilderOut, Some(c))
    } else if (C1.contains(c)) {
      logger.trace("Going to state 2")
      State2(oldState.stringBuilderOut, Some(c))
    } else invalid(c)

  private def action1(oldState: State, c: Char): State =
    if (C1.contains(c)) {
      logger.trace("Going to state 2")
      State2(oldState.stringBuilderOut, Some(c))
    } else invalid(c)

  private def action2(oldState: State, c: Char): State =
    if (C2.contains(c)) {
      logger.trace("Going to state 3")
      State3(oldState.stringBuilderOut, Some(c))
    } else if (V2.contains(c)) {
      logger.trace("Going to state 4")
      State4(oldState.stringBuilderOut, Some(c))
    } else if (T.contains(c)) {
      logger.trace("Going to state 5")
      State5(oldState.stringBuilderOut, Some(c))
    } else if (V3.contains(c)) {
      logger.trace("Going to state 6")
      State6(oldState.stringBuilderOut, Some(c))
    } else if (C3.contains(c)) {
      logger.trace("Going to state 9")
      State9(oldState.stringBuilderOut, Some(c))
    } else if (V1.contains(c)) {
      logger.trace("Going to state 7")
      State7(oldState.stringBuilderOut, Some(c))
    } else if (C1.contains(c)) {
      logger.trace("Going to state 8")
      State8(oldState.stringBuilderOut, Some(c))
    } else invalid(c)

  private def action3(oldState: State, c: Char): State =
    if (V2.contains(c)) {
      logger.trace("Going to state 4")
      State4(oldState.stringBuilderOut, Some(c))
    } else if (T.contains(c)) {
      logger.trace("Going to state 5")
      State5(oldState.stringBuilderOut, Some(c))
    } else if (V3.contains(c)) {
      logger.trace("Going to state 6")
      State6(oldState.stringBuilderOut, Some(c))
    } else if (C3.contains(c)) {
      logger.trace("Going to state 9")
      State9(oldState.stringBuilderOut, Some(c))
    } else invalid(c)

  private def action4(oldState: State, c: Char): State =
    if (T.contains(c)) {
      logger.trace("Going to state 5")
      State5(oldState.stringBuilderOut, Some(c))
    } else if (V3.contains(c)) {
      logger.trace("Going to state 6")
      State6(oldState.stringBuilderOut, Some(c))
    } else if (C3.contains(c)) {
      logger.trace("Going to state 9")
      State9(oldState.stringBuilderOut, Some(c))
    } else if (V1.contains(c)) {
      logger.trace("Going to state 7")
      State7(oldState.stringBuilderOut, Some(c))
    } else if (C1.contains(c)) {
      logger.trace("Going to state 8")
      State8(oldState.stringBuilderOut, Some(c))
    } else invalid(c)

  private def action5(oldState: State, c: Char): State =
    if (V3.contains(c)) {
      logger.trace("Going to state 6")
      State6(oldState.stringBuilderOut, Some(c))
    } else if (C3.contains(c)) {
      logger.trace("Going to state 9")
      State9(oldState.stringBuilderOut, Some(c))
    } else if (V1.contains(c)) {
      logger.trace("Going to state 7")
      State7(oldState.stringBuilderOut, Some(c))
    } else if (C1.contains(c)) {
      logger.trace("Going to state 8")
      State8(oldState.stringBuilderOut, Some(c))
    } else invalid(c)

  private def action6(oldState: State, c: Char): State =
    if (C3.contains(c)) {
      logger.trace("Going to state 9")
      State9(oldState.stringBuilderOut, Some(c))
    } else if (V1.contains(c)) {
      logger.trace("Going to state 7")
      State7(oldState.stringBuilderOut, Some(c))
    } else if (C1.contains(c)) {
      logger.trace("Going to state 8")
      State8(oldState.stringBuilderOut, Some(c))
    } else invalid(c)

  // Because actions 7, 8, and 9 don't consume input,
  // they will simply be a pass-through to the action of the
  // state for which they are acting as a proxy
  private def action7(oldState: State, c: Char): State = action1(oldState, c)
  private def action8(oldState: State, c: Char): State = action2(oldState, c)
  private def action9(oldState: State, c: Char): State = action0(oldState, c)

  /** Given a state and a character, choose the next action to perform
    * (i.e., choose the next state).
    *
    * @param state the state of the machine right now
    * @param c the character triggering a transition
    * @return the new state of the machine
    * @throws IllegalArgumentException if `state` has no transition for `c`
    */
  private def segmentationFunction(state: State, c: Char): State = state match {
    case s: State0 => action0(s, c)
    case s: State1 => action1(s, c)
    case s: State2 => action2(s, c)
    case s: State3 => action3(s, c)
    case s: State4 => action4(s, c)
    case s: State5 => action5(s, c)
    case s: State6 => action6(s, c)
    case s: State7 => action7(s, c)
    case s: State8 => action8(s, c)
    case s: State9 => action9(s, c)
  }

  /** A fresh start state with its own empty builder; one is created per line. */
  private final def initialState: State = {
    // Traced here, not in the class body, so it is logged for every line
    // rather than once when the machine is constructed.
    logger.trace("In state 0")
    State0(new StringBuilder, None)
  }

  /** For a given line, this function will work through
    * the entire transition of state in the state machine.
    *
    * @param line the line being processed
    * @param start the state the machine is in before the first character
    * @return the final state
    */
  private def transition(line: String, start: State): State =
    // Walk the characters left to right, feeding each one together with the
    // current state into segmentationFunction to obtain the next state.
    line.foldLeft(start) { (state, c) =>
      logger.trace(s"Handling $c")
      segmentationFunction(state, c)
    }

  /** Apply transition to the given line and return the output string from
    * the resultant state.
    *
    * @param line the line being processed
    * @return the output string of the final state
    */
  private def segmentLine(line: String): String = {
    // stringBuilderOut mutates the shared builder, so read it exactly once and
    // reuse the resulting string for both the log message and the result.
    val segmented = transition(line, initialState).stringBuilderOut.toString
    logger.debug(s"Processed line: $segmented")
    segmented
  }

  /** Segments all lines.
    *
    * @return the SegmentedLines of all the inputs
    */
  def segmentLines: SegmentedLines = new SegmentedLines(lines.map(segmentLine))
}



Here, SegmentedLines is just a class that will transform the output into a standard form -- it is beyond the scope of the review, but for completeness, it's below:



package edu.washington.rippeth.ling473.proj3

import java.io.BufferedWriter, File, FileWriter

class SegmentedLines(l: Iterable[String]) <meta http-equiv='Content-Type' content='text/html; charset=UTF-8' />



My tests are here:



package edu.washington.rippeth.ling473.proj3

import org.scalatest.FlatSpec

/** End-to-end check of the segmenter against the two reference lines. */
class StateMachineTest extends FlatSpec {

  "A StateMachine" should "segment the example" in {
    val unsegmented: Seq[String] =
      Seq("แบ่งแผ่นดินออกเป็นสองสว่น", "หลุมพอต้นใหญ่งอกงามชิดตลิ่ง")
    val segmented =
      Seq("แบ่ง แผ่น ดิน ออก เป็น สอง สว่น", "หลุม พอ ต้น ให ญ่ง อก งาม ชิด ตลิ่ง")

    val machine: StateMachine = new StateMachine(unsegmented)

    assertResult(segmented)(machine.segmentLines.lines)
  }
}






and



package edu.washington.rippeth.ling473.proj3

import org.scalatest.Matchers, WordSpecLike

/** Unit tests for the builder helpers in the `proj3` package object. */
class UtilsSpec extends WordSpecLike with Matchers {

  /** Runs `block` against a brand-new, empty builder so tests never share one. */
  private def withCleanStringBuilder(block: StringBuilder => Any): Unit = {
    block(new StringBuilder)
  }

  "optionallyAppend" should {
    "append when the character is not None" in withCleanStringBuilder { builder =>
      builder shouldBe empty
      val expected: Option[Char] = Some('c')
      val result = optionallyAppend(builder, expected)
      result.length should ===(1)
      result.headOption shouldBe expected
    }

    "not append when the character is None" in withCleanStringBuilder { builder =>
      builder shouldBe empty
      val result = optionallyAppend(builder, None)
      result shouldBe empty
    }
  }

  "addSpaceBeforeLast" should {
    "add a space before the last character" in withCleanStringBuilder { builder =>
      val head = "hell"
      val tail = "o"
      builder.appendAll(head + tail)
      val result = addSpaceBeforeLast(builder)
      // One space was inserted, so the length grows by exactly one
      result.length should ===(head.length + tail.length + 1)
      result.toString should ===(s"$head $tail")
    }
  }

  "addSpaceAtEnd" should {
    "add a space at the last character" in withCleanStringBuilder { builder =>
      val expected = "hello "
      builder.appendAll(expected.trim)
      addSpaceAtEnd(builder).toString should ===(expected)
    }
  }
}






I'm interested in simplifying the states if possible and also general best practices. This implementation is fast enough™, but any performance improvements would be helpful (though I don't expect crazy benchmarking).







share|improve this question



























    up vote
    3
    down vote

    favorite












    I am developing an FSM to segment Thai syllables with the following rules:



    enter image description here



    Here are my util functions:



    package edu.washington.rippeth.ling473

    package object proj3

    private[proj3] def optionallyAppend(sb: StringBuilder, char: Option[Char]): StringBuilder = char match
    case Some(c) =>
    sb.append(c)
    sb
    case None =>
    sb


    /** Inserts a space before the last element.
    *
    * @param sb the string to which a space should be added
    * @return the string with a space at the n-1st position
    */
    private[proj3] def addSpaceBeforeLast(sb: StringBuilder): StringBuilder =
    sb.insert(sb.length-1, ' ')
    sb


    /** Appends a space to the StringBuilder.
    *
    * @param sb the StringBuilder to which a space should be added
    * @return the StringBuilder with a space at the end
    */
    private[proj3] def addSpaceAtEnd(sb: StringBuilder): StringBuilder =
    sb.append(' ')
    sb





    Here are my states:



    package edu.washington.rippeth.ling473.proj3

    private[proj3] sealed trait State {

      /** The string builder before this state's character is potentially applied. */
      protected def stringBuilderIn: StringBuilder

      /** The character which triggered the transition into this state.
        *
        * It is an `Option[Char]` because Scala has no empty char literal; the
        * initial state carries `None`.
        */
      protected def char: Option[Char]

      /** An optional function applied to the builder after the character has been
        * appended. This is `None` in the normal case.
        */
      protected def postProcessesStringBuilder: Option[StringBuilder => StringBuilder]

      /** The builder after this state's character has been appended and the
        * post-processing function (if any) has been applied.
        *
        * The builder is mutated in place, so this is a `lazy val`: the append and
        * post-processing happen exactly once per state, on first access. Reading
        * the value again (for logging, say) no longer appends the character or the
        * space a second time.
        *
        * @return the same builder instance as `stringBuilderIn`, updated
        */
      lazy val stringBuilderOut: StringBuilder = {
        val appended = optionallyAppend(stringBuilderIn, char)
        postProcessesStringBuilder.fold(appended)(f => f(appended))
      }
    }



    private[proj3] sealed trait NormalState extends State
    /**
    * In the normal case, we don't post process the string builder -- potentially adding
    * the character is good enough.
    * @return
    */
    final protected def postProcessesStringBuilder: Option[StringBuilder => StringBuilder] = None


    private[proj3] sealed trait StateWithPenultimateSpace extends State
    /**
    * In state 7 and 8, a space is to be added before the penultimate character,
    * so this will be the post-processing function.
    */
    final protected def postProcessesStringBuilder: Option[StringBuilder => StringBuilder] =
    Some(addSpaceBeforeLast)


    private[proj3] sealed trait StateWithFinalSpace extends State
    /**
    * In state 9, a space is to be appended to the string, so this will be
    * the post-processing function.
    */
    final protected def postProcessesStringBuilder: Option[StringBuilder => StringBuilder] =
    Some(addSpaceAtEnd)


    // State0 through State6 are very vanilla -- just appending
    // a character to the output string
    private[proj3] final case class State0(stringBuilderIn: StringBuilder, char: Option[Char])
    extends NormalState
    private[proj3] final case class State1(stringBuilderIn: StringBuilder, char: Option[Char])
    extends NormalState
    private[proj3] final case class State2(stringBuilderIn: StringBuilder, char: Option[Char])
    extends NormalState
    private[proj3] final case class State3(stringBuilderIn: StringBuilder, char: Option[Char])
    extends NormalState
    private[proj3] final case class State4(stringBuilderIn: StringBuilder, char: Option[Char])
    extends NormalState
    private[proj3] final case class State5(stringBuilderIn: StringBuilder, char: Option[Char])
    extends NormalState
    private[proj3] final case class State6(stringBuilderIn: StringBuilder, char: Option[Char])
    extends NormalState

    // State7 and State8 are special -- add a space before the last character
    // in the output string
    private[proj3] final case class State7(stringBuilderIn: StringBuilder, char: Option[Char])
    extends StateWithPenultimateSpace
    private[proj3] final case class State8(stringBuilderIn: StringBuilder, char: Option[Char])
    extends StateWithPenultimateSpace

    // State9 is also special -- add a space to the end of the output string
    private[proj3] final case class State9(stringBuilderIn: StringBuilder, char: Option[Char])
    extends StateWithFinalSpace


    And here is my state machine:



    package edu.washington.rippeth.ling473.proj3

    import com.typesafe.scalalogging.LazyLogging

    /** Given input lines, moves through the state machine for each line.
      *
      * Every character of a line drives exactly one transition. The states
      * accumulate the characters, and the separating spaces, in one shared
      * `StringBuilder` per line.
      *
      * @param lines the unsegmented Thai lines
      */
    class StateMachine(lines: Iterable[String]) extends LazyLogging {

      // The categories that force transitions
      private final val V1: Set[Char] = "เแโใไ".toSet
      private final val C1: Set[Char] = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮ".toSet
      private final val C2: Set[Char] = "รลวนม".toSet
      // Combining vowel marks, written as escapes. Typing them on a dotted-circle
      // placeholder (U+25CC) would put the placeholder itself into the set.
      private final val V2: Set[Char] = "\u0E34\u0E35\u0E36\u0E37\u0E38\u0E39\u0E31\u0E47".toSet
      // The four tone marks
      private final val T: Set[Char] = Set('\u0E48', '\u0E49', '\u0E4A', '\u0E4B')
      private final val V3: Set[Char] = "าอยว".toSet
      private final val C3: Set[Char] = "งนมดบกยว".toSet

      /** Rejects a character for which the current state has no transition.
        *
        * @throws IllegalArgumentException always
        */
      private def invalid(c: Char): Nothing =
        throw new IllegalArgumentException(s"Invalid input '$c'")

      // The following methods define the actions to be performed
      // (i.e., defining to which state the machine should transition).
      // The naming convention is actionN where N is the state before
      // transition. The order of the checks matters: the categories overlap,
      // so the first matching category wins.

      private def action0(oldState: State, c: Char): State =
        if (V1.contains(c)) {
          logger.trace("Going to state 1")
          State1(oldState.stringBuilderOut, Some(c))
        } else if (C1.contains(c)) {
          logger.trace("Going to state 2")
          State2(oldState.stringBuilderOut, Some(c))
        } else invalid(c)

      private def action1(oldState: State, c: Char): State =
        if (C1.contains(c)) {
          logger.trace("Going to state 2")
          State2(oldState.stringBuilderOut, Some(c))
        } else invalid(c)

      private def action2(oldState: State, c: Char): State =
        if (C2.contains(c)) {
          logger.trace("Going to state 3")
          State3(oldState.stringBuilderOut, Some(c))
        } else if (V2.contains(c)) {
          logger.trace("Going to state 4")
          State4(oldState.stringBuilderOut, Some(c))
        } else if (T.contains(c)) {
          logger.trace("Going to state 5")
          State5(oldState.stringBuilderOut, Some(c))
        } else if (V3.contains(c)) {
          logger.trace("Going to state 6")
          State6(oldState.stringBuilderOut, Some(c))
        } else if (C3.contains(c)) {
          logger.trace("Going to state 9")
          State9(oldState.stringBuilderOut, Some(c))
        } else if (V1.contains(c)) {
          logger.trace("Going to state 7")
          State7(oldState.stringBuilderOut, Some(c))
        } else if (C1.contains(c)) {
          logger.trace("Going to state 8")
          State8(oldState.stringBuilderOut, Some(c))
        } else invalid(c)

      private def action3(oldState: State, c: Char): State =
        if (V2.contains(c)) {
          logger.trace("Going to state 4")
          State4(oldState.stringBuilderOut, Some(c))
        } else if (T.contains(c)) {
          logger.trace("Going to state 5")
          State5(oldState.stringBuilderOut, Some(c))
        } else if (V3.contains(c)) {
          logger.trace("Going to state 6")
          State6(oldState.stringBuilderOut, Some(c))
        } else if (C3.contains(c)) {
          logger.trace("Going to state 9")
          State9(oldState.stringBuilderOut, Some(c))
        } else invalid(c)

      private def action4(oldState: State, c: Char): State =
        if (T.contains(c)) {
          logger.trace("Going to state 5")
          State5(oldState.stringBuilderOut, Some(c))
        } else if (V3.contains(c)) {
          logger.trace("Going to state 6")
          State6(oldState.stringBuilderOut, Some(c))
        } else if (C3.contains(c)) {
          logger.trace("Going to state 9")
          State9(oldState.stringBuilderOut, Some(c))
        } else if (V1.contains(c)) {
          logger.trace("Going to state 7")
          State7(oldState.stringBuilderOut, Some(c))
        } else if (C1.contains(c)) {
          logger.trace("Going to state 8")
          State8(oldState.stringBuilderOut, Some(c))
        } else invalid(c)

      private def action5(oldState: State, c: Char): State =
        if (V3.contains(c)) {
          logger.trace("Going to state 6")
          State6(oldState.stringBuilderOut, Some(c))
        } else if (C3.contains(c)) {
          logger.trace("Going to state 9")
          State9(oldState.stringBuilderOut, Some(c))
        } else if (V1.contains(c)) {
          logger.trace("Going to state 7")
          State7(oldState.stringBuilderOut, Some(c))
        } else if (C1.contains(c)) {
          logger.trace("Going to state 8")
          State8(oldState.stringBuilderOut, Some(c))
        } else invalid(c)

      private def action6(oldState: State, c: Char): State =
        if (C3.contains(c)) {
          logger.trace("Going to state 9")
          State9(oldState.stringBuilderOut, Some(c))
        } else if (V1.contains(c)) {
          logger.trace("Going to state 7")
          State7(oldState.stringBuilderOut, Some(c))
        } else if (C1.contains(c)) {
          logger.trace("Going to state 8")
          State8(oldState.stringBuilderOut, Some(c))
        } else invalid(c)

      // Because actions 7, 8, and 9 don't consume input,
      // they will simply be a pass-through to the action of the
      // state for which they are acting as a proxy
      private def action7(oldState: State, c: Char): State = action1(oldState, c)
      private def action8(oldState: State, c: Char): State = action2(oldState, c)
      private def action9(oldState: State, c: Char): State = action0(oldState, c)

      /** Given a state and a character, choose the next action to perform
        * (i.e., choose the next state).
        *
        * @param state the state of the machine right now
        * @param c the character triggering a transition
        * @return the new state of the machine
        * @throws IllegalArgumentException if `state` has no transition for `c`
        */
      private def segmentationFunction(state: State, c: Char): State = state match {
        case s: State0 => action0(s, c)
        case s: State1 => action1(s, c)
        case s: State2 => action2(s, c)
        case s: State3 => action3(s, c)
        case s: State4 => action4(s, c)
        case s: State5 => action5(s, c)
        case s: State6 => action6(s, c)
        case s: State7 => action7(s, c)
        case s: State8 => action8(s, c)
        case s: State9 => action9(s, c)
      }

      /** A fresh start state with its own empty builder; one is created per line. */
      private final def initialState: State = {
        // Traced here, not in the class body, so it is logged for every line
        // rather than once when the machine is constructed.
        logger.trace("In state 0")
        State0(new StringBuilder, None)
      }

      /** For a given line, this function will work through
        * the entire transition of state in the state machine.
        *
        * @param line the line being processed
        * @param start the state the machine is in before the first character
        * @return the final state
        */
      private def transition(line: String, start: State): State =
        // Walk the characters left to right, feeding each one together with the
        // current state into segmentationFunction to obtain the next state.
        line.foldLeft(start) { (state, c) =>
          logger.trace(s"Handling $c")
          segmentationFunction(state, c)
        }

      /** Apply transition to the given line and return the output string from
        * the resultant state.
        *
        * @param line the line being processed
        * @return the output string of the final state
        */
      private def segmentLine(line: String): String = {
        // stringBuilderOut mutates the shared builder, so read it exactly once and
        // reuse the resulting string for both the log message and the result.
        val segmented = transition(line, initialState).stringBuilderOut.toString
        logger.debug(s"Processed line: $segmented")
        segmented
      }

      /** Segments all lines.
        *
        * @return the SegmentedLines of all the inputs
        */
      def segmentLines: SegmentedLines = new SegmentedLines(lines.map(segmentLine))
    }



    Here, SegmentedLines is just a class that will transform the output into a standard form -- it is beyond the scope of the review, but for completeness, it's below:



    package edu.washington.rippeth.ling473.proj3

    import java.io.BufferedWriter, File, FileWriter

    class SegmentedLines(l: Iterable[String]) <meta http-equiv='Content-Type' content='text/html; charset=UTF-8' />



    My tests are here:



    package edu.washington.rippeth.ling473.proj3

    import org.scalatest.FlatSpec

    class StateMachineTest extends FlatSpec

    "A StateMachine" should "segment the example" in

    val input: Seq[String] = Seq("แบ่งแผ่นดินออกเป็นสองสว่น", "หลุมพอต้นใหญ่งอกงามชิดตลิ่ง")
    val expectedOutput = Seq("แบ่ง แผ่น ดิน ออก เป็น สอง สว่น", "หลุม พอ ต้น ให ญ่ง อก งาม ชิด ตลิ่ง")
    val stateMachine: StateMachine = new StateMachine(input)

    assertResult(expectedOutput)
    stateMachine.segmentLines.lines






    and



    package edu.washington.rippeth.ling473.proj3

    import org.scalatest.Matchers, WordSpecLike

    class UtilsSpec extends WordSpecLike with Matchers

    private def withCleanStringBuilder(block: StringBuilder => Any): Unit =
    val sb = new StringBuilder
    block(sb)


    "optionallyAppend" should
    "append when the character is not None" in withCleanStringBuilder sb =>
    sb shouldBe empty
    val charToAdd = 'c'
    val char @ Some(c) = Some(charToAdd)
    val newSb = optionallyAppend(sb, char)
    newSb.length should===(1)
    newSb.headOption shouldBe char


    "not append when the character is None" in withCleanStringBuilder sb =>
    sb shouldBe empty
    val char = None
    val newSb = optionallyAppend(sb, char)
    newSb shouldBe empty



    "addSpaceBeforeLast" should
    "add a space before the last character" in withCleanStringBuilder sb =>
    val (first, last) = ("hell", "o")
    val totalLength = first.length + last.length
    sb.appendAll(s"$first$last")
    val newSb = addSpaceBeforeLast(sb)
    // Added a space, so length is updated
    newSb.length should===(totalLength + 1)
    val expectedString = s"$first $last"
    newSb.toString should===(expectedString)



    "addSpaceAtEnd" should
    "add a space at the last character" in withCleanStringBuilder sb =>
    val expectedString = "hello "
    sb.appendAll(expectedString.trim)
    val newSb = addSpaceAtEnd(sb)
    newSb.toString should===(expectedString)






    I'm interested in simplifying the states if possible and also general best practices. This implementation is fast enough™, but any performance improvements would be helpful (though I don't expect crazy benchmarking).







    share|improve this question























      up vote
      3
      down vote

      favorite









      up vote
      3
      down vote

      favorite











      I am developing an FSM to segment Thai syllables with the following rules:



      enter image description here



      Here are my util functions:



      package edu.washington.rippeth.ling473

      package object proj3

      private[proj3] def optionallyAppend(sb: StringBuilder, char: Option[Char]): StringBuilder = char match
      case Some(c) =>
      sb.append(c)
      sb
      case None =>
      sb


      /** Inserts a space before the last element.
      *
      * @param sb the string to which a space should be added
      * @return the string with a space at the n-1st position
      */
      private[proj3] def addSpaceBeforeLast(sb: StringBuilder): StringBuilder =
      sb.insert(sb.length-1, ' ')
      sb


      /** Appends a space to the StringBuilder.
      *
      * @param sb the StringBuilder to which a space should be added
      * @return the StringBuilder with a space at the end
      */
      private[proj3] def addSpaceAtEnd(sb: StringBuilder): StringBuilder =
      sb.append(' ')
      sb





      Here are my states:



      package edu.washington.rippeth.ling473.proj3

      private[proj3] sealed trait State {

        /** The string builder before this state's character is potentially applied. */
        protected def stringBuilderIn: StringBuilder

        /** The character which triggered the transition into this state.
          *
          * It is an `Option[Char]` because Scala has no empty char literal; the
          * initial state carries `None`.
          */
        protected def char: Option[Char]

        /** An optional function applied to the builder after the character has been
          * appended. This is `None` in the normal case.
          */
        protected def postProcessesStringBuilder: Option[StringBuilder => StringBuilder]

        /** The builder after this state's character has been appended and the
          * post-processing function (if any) has been applied.
          *
          * The builder is mutated in place, so this is a `lazy val`: the append and
          * post-processing happen exactly once per state, on first access. Reading
          * the value again (for logging, say) no longer appends the character or the
          * space a second time.
          *
          * @return the same builder instance as `stringBuilderIn`, updated
          */
        lazy val stringBuilderOut: StringBuilder = {
          val appended = optionallyAppend(stringBuilderIn, char)
          postProcessesStringBuilder.fold(appended)(f => f(appended))
        }
      }



      private[proj3] sealed trait NormalState extends State
      /**
      * In the normal case, we don't post process the string builder -- potentially adding
      * the character is good enough.
      * @return
      */
      final protected def postProcessesStringBuilder: Option[StringBuilder => StringBuilder] = None


      private[proj3] sealed trait StateWithPenultimateSpace extends State
      /**
      * In state 7 and 8, a space is to be added before the penultimate character,
      * so this will be the post-processing function.
      */
      final protected def postProcessesStringBuilder: Option[StringBuilder => StringBuilder] =
      Some(addSpaceBeforeLast)


      private[proj3] sealed trait StateWithFinalSpace extends State
      /**
      * In state 9, a space is to be appended to the string, so this will be
      * the post-processing function.
      */
      final protected def postProcessesStringBuilder: Option[StringBuilder => StringBuilder] =
      Some(addSpaceAtEnd)


      // State0 through State6 are plain states: they extend NormalState, so the
      // only effect on the output is (optionally) appending `char` to
      // `stringBuilderIn` -- no post-processing is applied.
      private[proj3] final case class State0(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState
      private[proj3] final case class State1(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState
      private[proj3] final case class State2(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState
      private[proj3] final case class State3(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState
      private[proj3] final case class State4(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState
      private[proj3] final case class State5(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState
      private[proj3] final case class State6(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState

      // State7 and State8 extend StateWithPenultimateSpace: after `char` is
      // appended, a space is inserted just before that last character.
      private[proj3] final case class State7(stringBuilderIn: StringBuilder, char: Option[Char])
      extends StateWithPenultimateSpace
      private[proj3] final case class State8(stringBuilderIn: StringBuilder, char: Option[Char])
      extends StateWithPenultimateSpace

      // State9 extends StateWithFinalSpace: after `char` is appended, a space
      // is appended to the end of the output string.
      private[proj3] final case class State9(stringBuilderIn: StringBuilder, char: Option[Char])
      extends StateWithFinalSpace


      And here is my state machine:



      package edu.washington.rippeth.ling473.proj3

      import com.typesafe.scalalogging.LazyLogging

      /** Given input lines, moves through the state machine for
      * each line
      *
      * @param lines the unsegmented Thai lines
      */
      class StateMachine(lines: Iterable[String]) extends LazyLogging {

        // The character categories that force transitions.
        // Several categories overlap, so the order in which each action tests
        // them is significant.
        private final val V1: Set[Char] = "เแโใไ".toSet
        private final val C1: Set[Char] = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮ".toSet
        private final val C2: Set[Char] = "รลวนม".toSet
        // Combining vowel signs, spelled as escapes so that no dotted-circle
        // placeholder (U+25CC) ends up in the set as an accepted input.
        private final val V2: Set[Char] =
          Set('\u0E34', '\u0E35', '\u0E36', '\u0E37', '\u0E38', '\u0E39', '\u0E31', '\u0E47')
        // Tone marks.
        private final val T: Set[Char] = Set('\u0E48', '\u0E49', '\u0E4A', '\u0E4B')
        private final val V3: Set[Char] = "าอยว".toSet
        private final val C3: Set[Char] = "งนมดบกยว".toSet

        /** Fails the current transition: `c` is not accepted by the current state.
         *
         * @param c the offending character
         * @throws IllegalArgumentException always
         */
        private def invalidInput(c: Char): Nothing =
          throw new IllegalArgumentException(s"Invalid input '$c'")

        // The following methods define the actions to be performed
        // (i.e., defining to which state the machine should transition).
        // The naming convention is actionN where N is the state before
        // the transition. Each action evaluates `oldState.stringBuilderOut`
        // exactly once, in the branch that is taken.

        /** Transition out of state 0: V1 -> state 1, C1 -> state 2. */
        private def action0(oldState: State, c: Char): State =
          if (V1.contains(c)) {
            logger.trace("Going to state 1")
            State1(oldState.stringBuilderOut, Some(c))
          } else if (C1.contains(c)) {
            logger.trace("Going to state 2")
            State2(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        /** Transition out of state 1: C1 -> state 2. */
        private def action1(oldState: State, c: Char): State =
          if (C1.contains(c)) {
            logger.trace("Going to state 2")
            State2(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        /** Transition out of state 2, tested in the order C2, V2, T, V3, C3, V1, C1. */
        private def action2(oldState: State, c: Char): State =
          if (C2.contains(c)) {
            logger.trace("Going to state 3")
            State3(oldState.stringBuilderOut, Some(c))
          } else if (V2.contains(c)) {
            logger.trace("Going to state 4")
            State4(oldState.stringBuilderOut, Some(c))
          } else if (T.contains(c)) {
            logger.trace("Going to state 5")
            State5(oldState.stringBuilderOut, Some(c))
          } else if (V3.contains(c)) {
            logger.trace("Going to state 6")
            State6(oldState.stringBuilderOut, Some(c))
          } else if (C3.contains(c)) {
            logger.trace("Going to state 9")
            State9(oldState.stringBuilderOut, Some(c))
          } else if (V1.contains(c)) {
            logger.trace("Going to state 7")
            State7(oldState.stringBuilderOut, Some(c))
          } else if (C1.contains(c)) {
            logger.trace("Going to state 8")
            State8(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        /** Transition out of state 3, tested in the order V2, T, V3, C3. */
        private def action3(oldState: State, c: Char): State =
          if (V2.contains(c)) {
            logger.trace("Going to state 4")
            State4(oldState.stringBuilderOut, Some(c))
          } else if (T.contains(c)) {
            logger.trace("Going to state 5")
            State5(oldState.stringBuilderOut, Some(c))
          } else if (V3.contains(c)) {
            logger.trace("Going to state 6")
            State6(oldState.stringBuilderOut, Some(c))
          } else if (C3.contains(c)) {
            logger.trace("Going to state 9")
            State9(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        /** Transition out of state 4, tested in the order T, V3, C3, V1, C1. */
        private def action4(oldState: State, c: Char): State =
          if (T.contains(c)) {
            logger.trace("Going to state 5")
            State5(oldState.stringBuilderOut, Some(c))
          } else if (V3.contains(c)) {
            logger.trace("Going to state 6")
            State6(oldState.stringBuilderOut, Some(c))
          } else if (C3.contains(c)) {
            logger.trace("Going to state 9")
            State9(oldState.stringBuilderOut, Some(c))
          } else if (V1.contains(c)) {
            logger.trace("Going to state 7")
            State7(oldState.stringBuilderOut, Some(c))
          } else if (C1.contains(c)) {
            logger.trace("Going to state 8")
            State8(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        /** Transition out of state 5, tested in the order V3, C3, V1, C1. */
        private def action5(oldState: State, c: Char): State =
          if (V3.contains(c)) {
            logger.trace("Going to state 6")
            State6(oldState.stringBuilderOut, Some(c))
          } else if (C3.contains(c)) {
            logger.trace("Going to state 9")
            State9(oldState.stringBuilderOut, Some(c))
          } else if (V1.contains(c)) {
            logger.trace("Going to state 7")
            State7(oldState.stringBuilderOut, Some(c))
          } else if (C1.contains(c)) {
            logger.trace("Going to state 8")
            State8(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        /** Transition out of state 6, tested in the order C3, V1, C1. */
        private def action6(oldState: State, c: Char): State =
          if (C3.contains(c)) {
            logger.trace("Going to state 9")
            State9(oldState.stringBuilderOut, Some(c))
          } else if (V1.contains(c)) {
            logger.trace("Going to state 7")
            State7(oldState.stringBuilderOut, Some(c))
          } else if (C1.contains(c)) {
            logger.trace("Going to state 8")
            State8(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        // Because actions 7, 8, and 9 don't consume input,
        // they will simply be a pass-through to the action of the
        // state for which they are acting as a proxy
        private def action7(oldState: State, c: Char): State = action1(oldState, c)
        private def action8(oldState: State, c: Char): State = action2(oldState, c)
        private def action9(oldState: State, c: Char): State = action0(oldState, c)

        /** Given a state and a character, choose the next action to perform
         * (i.e., choose the next state)
         *
         * @param state the state of the machine right now
         * @param c the character triggering a transition
         * @return the new state of the machine
         * @throws IllegalArgumentException if `c` is not accepted by `state`
         */
        private def segmentationFunction(state: State, c: Char): State = state match {
          case s: State0 => action0(s, c)
          case s: State1 => action1(s, c)
          case s: State2 => action2(s, c)
          case s: State3 => action3(s, c)
          case s: State4 => action4(s, c)
          case s: State5 => action5(s, c)
          case s: State6 => action6(s, c)
          case s: State7 => action7(s, c)
          case s: State8 => action8(s, c)
          case s: State9 => action9(s, c)
        }

        // Logged once, when the machine is constructed.
        logger.trace("In state 0")

        /** A fresh start state with its own empty string builder. */
        private final def initialState: State = State0(new StringBuilder, None)

        /** For a given line, this function will work through
         * the entire transition of state in the state machine.
         *
         * The returned state's output is deliberately NOT evaluated here:
         * evaluating `stringBuilderOut` applies the state's pending character
         * to the shared builder, so that must be left to the single caller
         * that actually consumes the output.
         *
         * @param line the line being processed
         * @param initialState the state from which processing starts
         * @return the final state
         */
        private def transition(line: String, initialState: State): State =
          // Fold left to right over the characters of the line, feeding each
          // one together with the current state to segmentationFunction.
          line.foldLeft(initialState) { (state, c) =>
            logger.trace(s"Handling $c")
            segmentationFunction(state, c)
          }

        /** Apply transition to the given line and return the output string from
         * the resultant state
         *
         * @param line the line being processed
         * @return the output string of the final state
         */
        private def segmentLine(line: String): String = {
          // Evaluate the final output exactly once and reuse it for logging, so
          // the result does not depend on whether debug logging is enabled.
          val segmented = transition(line, initialState).stringBuilderOut.toString
          logger.debug(s"Processed line: $segmented")
          segmented
        }

        /** Segments all lines
         *
         * @return the SegmentedLines of all the inputs
         */
        def segmentLines: SegmentedLines = new SegmentedLines(lines.map(segmentLine))
      }



      Here, SegmentedLines is just a class that will transform the output into a standard form -- it is beyond the scope of the review, but for completeness, it's below:



      package edu.washington.rippeth.ling473.proj3

      import java.io.BufferedWriter, File, FileWriter

      class SegmentedLines(l: Iterable[String]) <meta http-equiv='Content-Type' content='text/html; charset=UTF-8' />



      My tests are here:



      package edu.washington.rippeth.ling473.proj3

      import org.scalatest.FlatSpec

      class StateMachineTest extends FlatSpec {

        "A StateMachine" should "segment the example" in {
          // Two unsegmented sample lines and the segmentation expected for each.
          val unsegmented: Seq[String] =
            Seq("แบ่งแผ่นดินออกเป็นสองสว่น", "หลุมพอต้นใหญ่งอกงามชิดตลิ่ง")
          val segmented =
            Seq("แบ่ง แผ่น ดิน ออก เป็น สอง สว่น", "หลุม พอ ต้น ให ญ่ง อก งาม ชิด ตลิ่ง")
          val machine: StateMachine = new StateMachine(unsegmented)

          assertResult(segmented) {
            machine.segmentLines.lines
          }
        }
      }






      and



      package edu.washington.rippeth.ling473.proj3

      import org.scalatest.Matchers, WordSpecLike

      class UtilsSpec extends WordSpecLike with Matchers {

        /** Runs `block` against a brand-new, empty StringBuilder. */
        private def withCleanStringBuilder(block: StringBuilder => Any): Unit = {
          block(new StringBuilder)
          ()
        }

        "optionallyAppend" should {
          "append when the character is not None" in withCleanStringBuilder { builder =>
            builder shouldBe empty
            val wrapped: Option[Char] = Some('c')
            val result = optionallyAppend(builder, wrapped)
            result.length should ===(1)
            result.headOption shouldBe wrapped
          }

          "not append when the character is None" in withCleanStringBuilder { builder =>
            builder shouldBe empty
            val result = optionallyAppend(builder, None)
            result shouldBe empty
          }
        }

        "addSpaceBeforeLast" should {
          "add a space before the last character" in withCleanStringBuilder { builder =>
            val prefix = "hell"
            val suffix = "o"
            builder.appendAll(prefix + suffix)
            val result = addSpaceBeforeLast(builder)
            // One space was inserted, so the length grows by exactly one
            result.length should ===(prefix.length + suffix.length + 1)
            result.toString should ===(s"$prefix $suffix")
          }
        }

        "addSpaceAtEnd" should {
          "add a space at the last character" in withCleanStringBuilder { builder =>
            val withTrailingSpace = "hello "
            builder.appendAll(withTrailingSpace.trim)
            val result = addSpaceAtEnd(builder)
            result.toString should ===(withTrailingSpace)
          }
        }
      }






      I'm interested in simplifying the states if possible and also general best practices. This implementation is fast enough™, but any performance improvements would be helpful (though I don't expect crazy benchmarking).







      share|improve this question













      I am developing an FSM to segment Thai syllables with the following rules:



      enter image description here



      Here are my util functions:



      package edu.washington.rippeth.ling473

      package object proj3 {

        /** Appends the character to the builder when one is present.
         *
         * @param sb the StringBuilder to which the character may be added
         * @param char the character to append, if any
         * @return the same StringBuilder, with the character appended when defined
         */
        private[proj3] def optionallyAppend(sb: StringBuilder, char: Option[Char]): StringBuilder = {
          char.foreach(c => sb.append(c))
          sb
        }

        /** Inserts a space before the last element.
         *
         * @param sb the StringBuilder into which a space should be inserted
         * @return the same StringBuilder, with a space just before its last character
         */
        private[proj3] def addSpaceBeforeLast(sb: StringBuilder): StringBuilder = {
          val lastIndex = sb.length - 1
          sb.insert(lastIndex, ' ')
        }

        /** Appends a space to the StringBuilder.
         *
         * @param sb the StringBuilder to which a space should be added
         * @return the same StringBuilder, with a space at the end
         */
        private[proj3] def addSpaceAtEnd(sb: StringBuilder): StringBuilder =
          sb.append(' ')
      }





      Here are my states:



      package edu.washington.rippeth.ling473.proj3

      private[proj3] sealed trait State {
        /**
         * The string builder before a character is potentially applied.
         */
        protected def stringBuilderIn: StringBuilder

        /**
         * The character which can trigger an update in state.
         *
         * Creating an Option[Char] is a hack because the empty char literal does not
         * exist in Scala.
         */
        protected def char: Option[Char]

        /**
         * An optional function which can transform the tape after the string builder
         * may have been updated. This will be `None` in the normal case.
         */
        protected def postProcessesStringBuilder: Option[StringBuilder => StringBuilder]

        /**
         * The string builder after this state has been applied: `char` is appended
         * (when defined) and the post-processing function, if any, is then run.
         *
         * Both steps mutate the shared builder in place, so the result is computed
         * at most once per state (`lazy val`); reading it again -- e.g. from a log
         * statement -- returns the same builder without re-applying the mutation.
         *
         * @return the (mutated) string builder
         */
        lazy val stringBuilderOut: StringBuilder = {
          val appended = optionallyAppend(stringBuilderIn, char)
          postProcessesStringBuilder.fold(appended)(postProcess => postProcess(appended))
        }
      }



      private[proj3] sealed trait NormalState extends State {
        /**
         * A normal state has no post-processing step: once the character has
         * (optionally) been appended, the string builder is left untouched.
         *
         * @return always an empty option
         */
        protected final def postProcessesStringBuilder: Option[StringBuilder => StringBuilder] =
          Option.empty
      }


      private[proj3] sealed trait StateWithPenultimateSpace extends State {
        /**
         * States 7 and 8 post-process the string builder with `addSpaceBeforeLast`,
         * i.e. a space is inserted just before the last character so that the
         * space itself ends up in the penultimate position.
         */
        protected final def postProcessesStringBuilder: Option[StringBuilder => StringBuilder] =
          Some((builder: StringBuilder) => addSpaceBeforeLast(builder))
      }


      private[proj3] sealed trait StateWithFinalSpace extends State {
        /**
         * State 9 post-processes the string builder with `addSpaceAtEnd`,
         * i.e. a space is appended after the last character.
         */
        protected final def postProcessesStringBuilder: Option[StringBuilder => StringBuilder] =
          Some((builder: StringBuilder) => addSpaceAtEnd(builder))
      }


      // State0 through State6 are plain states: they extend NormalState, so the
      // only effect on the output is (optionally) appending `char` to
      // `stringBuilderIn` -- no post-processing is applied.
      private[proj3] final case class State0(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState
      private[proj3] final case class State1(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState
      private[proj3] final case class State2(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState
      private[proj3] final case class State3(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState
      private[proj3] final case class State4(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState
      private[proj3] final case class State5(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState
      private[proj3] final case class State6(stringBuilderIn: StringBuilder, char: Option[Char])
      extends NormalState

      // State7 and State8 extend StateWithPenultimateSpace: after `char` is
      // appended, a space is inserted just before that last character.
      private[proj3] final case class State7(stringBuilderIn: StringBuilder, char: Option[Char])
      extends StateWithPenultimateSpace
      private[proj3] final case class State8(stringBuilderIn: StringBuilder, char: Option[Char])
      extends StateWithPenultimateSpace

      // State9 extends StateWithFinalSpace: after `char` is appended, a space
      // is appended to the end of the output string.
      private[proj3] final case class State9(stringBuilderIn: StringBuilder, char: Option[Char])
      extends StateWithFinalSpace


      And here is my state machine:



      package edu.washington.rippeth.ling473.proj3

      import com.typesafe.scalalogging.LazyLogging

      /** Given input lines, moves through the state machine for
      * each line
      *
      * @param lines the unsegmented Thai lines
      */
      class StateMachine(lines: Iterable[String]) extends LazyLogging {

        // The character categories that force transitions.
        // Several categories overlap, so the order in which each action tests
        // them is significant.
        private final val V1: Set[Char] = "เแโใไ".toSet
        private final val C1: Set[Char] = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮ".toSet
        private final val C2: Set[Char] = "รลวนม".toSet
        // Combining vowel signs, spelled as escapes so that no dotted-circle
        // placeholder (U+25CC) ends up in the set as an accepted input.
        private final val V2: Set[Char] =
          Set('\u0E34', '\u0E35', '\u0E36', '\u0E37', '\u0E38', '\u0E39', '\u0E31', '\u0E47')
        // Tone marks.
        private final val T: Set[Char] = Set('\u0E48', '\u0E49', '\u0E4A', '\u0E4B')
        private final val V3: Set[Char] = "าอยว".toSet
        private final val C3: Set[Char] = "งนมดบกยว".toSet

        /** Fails the current transition: `c` is not accepted by the current state.
         *
         * @param c the offending character
         * @throws IllegalArgumentException always
         */
        private def invalidInput(c: Char): Nothing =
          throw new IllegalArgumentException(s"Invalid input '$c'")

        // The following methods define the actions to be performed
        // (i.e., defining to which state the machine should transition).
        // The naming convention is actionN where N is the state before
        // the transition. Each action evaluates `oldState.stringBuilderOut`
        // exactly once, in the branch that is taken.

        /** Transition out of state 0: V1 -> state 1, C1 -> state 2. */
        private def action0(oldState: State, c: Char): State =
          if (V1.contains(c)) {
            logger.trace("Going to state 1")
            State1(oldState.stringBuilderOut, Some(c))
          } else if (C1.contains(c)) {
            logger.trace("Going to state 2")
            State2(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        /** Transition out of state 1: C1 -> state 2. */
        private def action1(oldState: State, c: Char): State =
          if (C1.contains(c)) {
            logger.trace("Going to state 2")
            State2(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        /** Transition out of state 2, tested in the order C2, V2, T, V3, C3, V1, C1. */
        private def action2(oldState: State, c: Char): State =
          if (C2.contains(c)) {
            logger.trace("Going to state 3")
            State3(oldState.stringBuilderOut, Some(c))
          } else if (V2.contains(c)) {
            logger.trace("Going to state 4")
            State4(oldState.stringBuilderOut, Some(c))
          } else if (T.contains(c)) {
            logger.trace("Going to state 5")
            State5(oldState.stringBuilderOut, Some(c))
          } else if (V3.contains(c)) {
            logger.trace("Going to state 6")
            State6(oldState.stringBuilderOut, Some(c))
          } else if (C3.contains(c)) {
            logger.trace("Going to state 9")
            State9(oldState.stringBuilderOut, Some(c))
          } else if (V1.contains(c)) {
            logger.trace("Going to state 7")
            State7(oldState.stringBuilderOut, Some(c))
          } else if (C1.contains(c)) {
            logger.trace("Going to state 8")
            State8(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        /** Transition out of state 3, tested in the order V2, T, V3, C3. */
        private def action3(oldState: State, c: Char): State =
          if (V2.contains(c)) {
            logger.trace("Going to state 4")
            State4(oldState.stringBuilderOut, Some(c))
          } else if (T.contains(c)) {
            logger.trace("Going to state 5")
            State5(oldState.stringBuilderOut, Some(c))
          } else if (V3.contains(c)) {
            logger.trace("Going to state 6")
            State6(oldState.stringBuilderOut, Some(c))
          } else if (C3.contains(c)) {
            logger.trace("Going to state 9")
            State9(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        /** Transition out of state 4, tested in the order T, V3, C3, V1, C1. */
        private def action4(oldState: State, c: Char): State =
          if (T.contains(c)) {
            logger.trace("Going to state 5")
            State5(oldState.stringBuilderOut, Some(c))
          } else if (V3.contains(c)) {
            logger.trace("Going to state 6")
            State6(oldState.stringBuilderOut, Some(c))
          } else if (C3.contains(c)) {
            logger.trace("Going to state 9")
            State9(oldState.stringBuilderOut, Some(c))
          } else if (V1.contains(c)) {
            logger.trace("Going to state 7")
            State7(oldState.stringBuilderOut, Some(c))
          } else if (C1.contains(c)) {
            logger.trace("Going to state 8")
            State8(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        /** Transition out of state 5, tested in the order V3, C3, V1, C1. */
        private def action5(oldState: State, c: Char): State =
          if (V3.contains(c)) {
            logger.trace("Going to state 6")
            State6(oldState.stringBuilderOut, Some(c))
          } else if (C3.contains(c)) {
            logger.trace("Going to state 9")
            State9(oldState.stringBuilderOut, Some(c))
          } else if (V1.contains(c)) {
            logger.trace("Going to state 7")
            State7(oldState.stringBuilderOut, Some(c))
          } else if (C1.contains(c)) {
            logger.trace("Going to state 8")
            State8(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        /** Transition out of state 6, tested in the order C3, V1, C1. */
        private def action6(oldState: State, c: Char): State =
          if (C3.contains(c)) {
            logger.trace("Going to state 9")
            State9(oldState.stringBuilderOut, Some(c))
          } else if (V1.contains(c)) {
            logger.trace("Going to state 7")
            State7(oldState.stringBuilderOut, Some(c))
          } else if (C1.contains(c)) {
            logger.trace("Going to state 8")
            State8(oldState.stringBuilderOut, Some(c))
          } else {
            invalidInput(c)
          }

        // Because actions 7, 8, and 9 don't consume input,
        // they will simply be a pass-through to the action of the
        // state for which they are acting as a proxy
        private def action7(oldState: State, c: Char): State = action1(oldState, c)
        private def action8(oldState: State, c: Char): State = action2(oldState, c)
        private def action9(oldState: State, c: Char): State = action0(oldState, c)

        /** Given a state and a character, choose the next action to perform
         * (i.e., choose the next state)
         *
         * @param state the state of the machine right now
         * @param c the character triggering a transition
         * @return the new state of the machine
         * @throws IllegalArgumentException if `c` is not accepted by `state`
         */
        private def segmentationFunction(state: State, c: Char): State = state match {
          case s: State0 => action0(s, c)
          case s: State1 => action1(s, c)
          case s: State2 => action2(s, c)
          case s: State3 => action3(s, c)
          case s: State4 => action4(s, c)
          case s: State5 => action5(s, c)
          case s: State6 => action6(s, c)
          case s: State7 => action7(s, c)
          case s: State8 => action8(s, c)
          case s: State9 => action9(s, c)
        }

        // Logged once, when the machine is constructed.
        logger.trace("In state 0")

        /** A fresh start state with its own empty string builder. */
        private final def initialState: State = State0(new StringBuilder, None)

        /** For a given line, this function will work through
         * the entire transition of state in the state machine.
         *
         * The returned state's output is deliberately NOT evaluated here:
         * evaluating `stringBuilderOut` applies the state's pending character
         * to the shared builder, so that must be left to the single caller
         * that actually consumes the output.
         *
         * @param line the line being processed
         * @param initialState the state from which processing starts
         * @return the final state
         */
        private def transition(line: String, initialState: State): State =
          // Fold left to right over the characters of the line, feeding each
          // one together with the current state to segmentationFunction.
          line.foldLeft(initialState) { (state, c) =>
            logger.trace(s"Handling $c")
            segmentationFunction(state, c)
          }

        /** Apply transition to the given line and return the output string from
         * the resultant state
         *
         * @param line the line being processed
         * @return the output string of the final state
         */
        private def segmentLine(line: String): String = {
          // Evaluate the final output exactly once and reuse it for logging, so
          // the result does not depend on whether debug logging is enabled.
          val segmented = transition(line, initialState).stringBuilderOut.toString
          logger.debug(s"Processed line: $segmented")
          segmented
        }

        /** Segments all lines
         *
         * @return the SegmentedLines of all the inputs
         */
        def segmentLines: SegmentedLines = new SegmentedLines(lines.map(segmentLine))
      }



      Here, SegmentedLines is just a class that will transform the output into a standard form -- it is beyond the scope of the review, but for completeness, it's below:



      package edu.washington.rippeth.ling473.proj3

      import java.io.BufferedWriter, File, FileWriter

      class SegmentedLines(l: Iterable[String]) <meta http-equiv='Content-Type' content='text/html; charset=UTF-8' />



      My tests are here:



      package edu.washington.rippeth.ling473.proj3

      import org.scalatest.FlatSpec

      class StateMachineTest extends FlatSpec {

        "A StateMachine" should "segment the example" in {
          // Two unsegmented sample lines and the segmentation expected for each.
          val unsegmented: Seq[String] =
            Seq("แบ่งแผ่นดินออกเป็นสองสว่น", "หลุมพอต้นใหญ่งอกงามชิดตลิ่ง")
          val segmented =
            Seq("แบ่ง แผ่น ดิน ออก เป็น สอง สว่น", "หลุม พอ ต้น ให ญ่ง อก งาม ชิด ตลิ่ง")
          val machine: StateMachine = new StateMachine(unsegmented)

          assertResult(segmented) {
            machine.segmentLines.lines
          }
        }
      }






      and



      package edu.washington.rippeth.ling473.proj3

      import org.scalatest.Matchers, WordSpecLike

      class UtilsSpec extends WordSpecLike with Matchers {

        /** Runs `block` against a brand-new, empty StringBuilder. */
        private def withCleanStringBuilder(block: StringBuilder => Any): Unit = {
          block(new StringBuilder)
          ()
        }

        "optionallyAppend" should {
          "append when the character is not None" in withCleanStringBuilder { builder =>
            builder shouldBe empty
            val wrapped: Option[Char] = Some('c')
            val result = optionallyAppend(builder, wrapped)
            result.length should ===(1)
            result.headOption shouldBe wrapped
          }

          "not append when the character is None" in withCleanStringBuilder { builder =>
            builder shouldBe empty
            val result = optionallyAppend(builder, None)
            result shouldBe empty
          }
        }

        "addSpaceBeforeLast" should {
          "add a space before the last character" in withCleanStringBuilder { builder =>
            val prefix = "hell"
            val suffix = "o"
            builder.appendAll(prefix + suffix)
            val result = addSpaceBeforeLast(builder)
            // One space was inserted, so the length grows by exactly one
            result.length should ===(prefix.length + suffix.length + 1)
            result.toString should ===(s"$prefix $suffix")
          }
        }

        "addSpaceAtEnd" should {
          "add a space at the last character" in withCleanStringBuilder { builder =>
            val withTrailingSpace = "hello "
            builder.appendAll(withTrailingSpace.trim)
            val result = addSpaceAtEnd(builder)
            result.toString should ===(withTrailingSpace)
          }
        }
      }






      I'm interested in simplifying the states if possible and also general best practices. This implementation is fast enough™, but any performance improvements would be helpful (though I don't expect crazy benchmarking).









      share|improve this question












      share|improve this question




      share|improve this question








      edited Jun 9 at 0:25









      Jamal♦

      30.1k11114225




      30.1k11114225









      asked Jun 8 at 12:32









      erip

      456214




      456214

























          active

          oldest

          votes











          Your Answer




          StackExchange.ifUsing("editor", function ()
          return StackExchange.using("mathjaxEditing", function ()
          StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
          StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
          );
          );
          , "mathjax-editing");

          StackExchange.ifUsing("editor", function ()
          StackExchange.using("externalEditor", function ()
          StackExchange.using("snippets", function ()
          StackExchange.snippets.init();
          );
          );
          , "code-snippets");

          StackExchange.ready(function()
          var channelOptions =
          tags: "".split(" "),
          id: "196"
          ;
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function()
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled)
          StackExchange.using("snippets", function()
          createEditor();
          );

          else
          createEditor();

          );

          function createEditor()
          StackExchange.prepareEditor(
          heartbeatType: 'answer',
          convertImagesToLinks: false,
          noModals: false,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: null,
          bindNavPrevention: true,
          postfix: "",
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          );



          );








           

          draft saved


          draft discarded


















          StackExchange.ready(
          function ()
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f196112%2ffinite-state-machine-to-segment-thai-syllables%23new-answer', 'question_page');

          );

          Post as a guest



































          active

          oldest

          votes













          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes










           

          draft saved


          draft discarded


























           


          draft saved


          draft discarded














          StackExchange.ready(
          function ()
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f196112%2ffinite-state-machine-to-segment-thai-syllables%23new-answer', 'question_page');

          );

          Post as a guest













































































          Popular posts from this blog

          Greedy Best First Search implementation in Rust

          Function to Return a JSON Like Objects Using VBA Collections and Arrays

          C++11 CLH Lock Implementation