Skip to content

Commit

Permalink
feat!: update general-sam to 1.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
ChieloNewctle committed Mar 27, 2024
1 parent 1f59cec commit 360e0c3
Show file tree
Hide file tree
Showing 13 changed files with 107 additions and 102 deletions.
10 changes: 5 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 7 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "general-sam-py"
version = "0.7.0-post0"
version = "1.0.0"
edition = "2021"
license = "MIT OR Apache-2.0"
description = "Python bindings for general-sam and some utilities"
Expand All @@ -15,5 +15,10 @@ crate-type = ["cdylib"]

[dependencies]
either = "1.10.0"
general-sam = { version = "0.7.0", features = ["all"] }
general-sam = { version = "1.0.0", features = ["all"] }
pyo3 = { version = "0.21.0", features = ["extension-module", "generate-import-lib", "abi3-py38"] }

[profile.release]
lto = true
strip = true
opt-level = "z"
20 changes: 10 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,12 @@ pip install general-sam

## Usage

### `GeneralSAM`
### `GeneralSam`

```python
from general_sam import GeneralSAM
from general_sam import GeneralSam

sam = GeneralSAM.from_bytes(b"abcbc")
sam = GeneralSam.from_bytes(b"abcbc")

# "cbc" is a suffix of "abcbc"
state = sam.get_root_state()
Expand All @@ -58,9 +58,9 @@ assert not state.is_accepting()
```

```python
from general_sam import GeneralSAM
from general_sam import GeneralSam

sam = GeneralSAM.from_chars("abcbc")
sam = GeneralSam.from_chars("abcbc")
state = sam.get_root_state()

# "b" is not a suffix but at least a substring of "abcbc"
Expand All @@ -81,13 +81,13 @@ assert not state.is_accepting() and state.is_nil()
```

```python
from general_sam import GeneralSAM, GeneralSAMState, build_trie_from_chars
from general_sam import GeneralSam, GeneralSamState, build_trie_from_chars

trie, _ = build_trie_from_chars(["hello", "Chielo"])
sam = GeneralSAM.from_trie(trie)
sam = GeneralSam.from_trie(trie)


def fetch_state(s: str) -> GeneralSAMState:
def fetch_state(s: str) -> GeneralSamState:
state = sam.get_root_state()
state.feed_chars(s)
return state
Expand Down Expand Up @@ -193,7 +193,7 @@ assert state.is_nil()
### `GreedyTokenizer`

```python
from general_sam import GeneralSAM, GreedyTokenizer, build_trie_from_chars
from general_sam import GeneralSam, GreedyTokenizer, build_trie_from_chars

vocab = ["a", "ab", "b", "bc", "c", "d", "e", "f", "cd", "abcde"]
trie, token_to_trie_node = build_trie_from_chars(vocab)
Expand All @@ -202,7 +202,7 @@ trie_node_to_token = [-1] * trie.num_of_nodes()
for i, j in enumerate(token_to_trie_node):
trie_node_to_token[j] = i

sam = GeneralSAM.from_trie(trie)
sam = GeneralSam.from_trie(trie)
tokenizer = GreedyTokenizer.from_sam_and_trie(sam, trie)


Expand Down
8 changes: 4 additions & 4 deletions general_sam/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .general_sam import (
GeneralSAM,
GeneralSAMState,
GeneralSam,
GeneralSamState,
GreedyTokenizer,
Trie,
TrieNode,
Expand All @@ -20,8 +20,8 @@
)

__all__ = [
"GeneralSAM",
"GeneralSAMState",
"GeneralSam",
"GeneralSamState",
"GreedyTokenizer",
"Trie",
"TrieNode",
Expand Down
38 changes: 19 additions & 19 deletions general_sam/general_sam.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ from typing import Callable, Mapping, Optional, Sequence, Tuple, Union

ByteOrChar = Union[str, int]
TrieNodeID = int
GeneralSAMNodeID = int
GeneralSamNodeID = int

NIL_NODE_ID = 0
ROOT_NODE_ID = 1
Expand Down Expand Up @@ -41,16 +41,16 @@ class Trie:
root_node_id: Optional[TrieNodeID] = None,
) -> TrieNode: ...

class GeneralSAMState:
class GeneralSamState:
def is_in_chars(self) -> bool: ...
def is_in_bytes(self) -> bool: ...
def get_node_id(self) -> GeneralSAMNodeID: ...
def get_node_id(self) -> GeneralSamNodeID: ...
def is_nil(self) -> bool: ...
def is_root(self) -> bool: ...
def is_accepting(self) -> bool: ...
def get_trans(self) -> Mapping[ByteOrChar, GeneralSAMNodeID]: ...
def get_suffix_parent_id(self) -> GeneralSAMNodeID: ...
def copy(self) -> "GeneralSAMState": ...
def get_trans(self) -> Mapping[ByteOrChar, GeneralSamNodeID]: ...
def get_suffix_parent_id(self) -> GeneralSamNodeID: ...
def clone(self) -> "GeneralSamState": ...
def goto_suffix_parent(self) -> None: ...
def goto_char(self, t: str) -> None: ...
def goto_byte(self, t: int) -> None: ...
Expand All @@ -60,39 +60,39 @@ class GeneralSAMState:
self,
trie: Trie,
in_stack_callback: Callable[
["GeneralSAMState", TrieNodeID, Optional[ByteOrChar]], None
["GeneralSamState", TrieNodeID, Optional[ByteOrChar]], None
],
out_stack_callback: Callable[["GeneralSAMState", TrieNodeID], None],
out_stack_callback: Callable[["GeneralSamState", TrieNodeID], None],
trie_node_id: Optional[TrieNodeID] = None,
) -> TrieNode: ...
def bfs_along(
self,
trie: Trie,
in_queue_callback: Callable[
["GeneralSAMState", TrieNodeID, Optional[ByteOrChar]], None
["GeneralSamState", TrieNodeID, Optional[ByteOrChar]], None
],
out_queue_callback: Callable[["GeneralSAMState", TrieNodeID], None],
out_queue_callback: Callable[["GeneralSamState", TrieNodeID], None],
trie_node_id: Optional[TrieNodeID] = None,
) -> TrieNode: ...

class GeneralSAM:
class GeneralSam:
@staticmethod
def from_chars(s: str) -> "GeneralSAM": ...
def from_chars(s: str) -> "GeneralSam": ...
@staticmethod
def from_bytes(s: bytes) -> "GeneralSAM": ...
def from_bytes(s: bytes) -> "GeneralSam": ...
@staticmethod
def from_trie(trie: Trie) -> "GeneralSAM": ...
def from_trie(trie: Trie) -> "GeneralSam": ...
def is_in_chars(self) -> bool: ...
def is_in_bytes(self) -> bool: ...
def num_of_nodes(self) -> int: ...
def get_root_state(self) -> GeneralSAMState: ...
def get_state(self, node_id: GeneralSAMNodeID) -> GeneralSAMState: ...
def get_topo_and_suf_len_sorted_states(self) -> Sequence[GeneralSAMState]: ...
def get_root_state(self) -> GeneralSamState: ...
def get_state(self, node_id: GeneralSamNodeID) -> GeneralSamState: ...
def get_topo_and_suf_len_sorted_states(self) -> Sequence[GeneralSamState]: ...

class GreedyTokenizer:
@staticmethod
def from_sam_and_trie(sam: GeneralSAM, trie: Trie) -> "GreedyTokenizer": ...
def get_sam(self) -> GeneralSAM: ...
def from_sam_and_trie(sam: GeneralSam, trie: Trie) -> "GreedyTokenizer": ...
def get_sam(self) -> GeneralSam: ...
def is_in_chars(self) -> bool: ...
def is_in_bytes(self) -> bool: ...
def tokenize_str(
Expand Down
14 changes: 7 additions & 7 deletions general_sam/vocab_prefix.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
cast,
)

from .general_sam import GeneralSAM, GeneralSAMState, Trie
from .general_sam import GeneralSam, GeneralSamState, Trie
from .trie_utils import (
CountInfo,
SortResult,
Expand Down Expand Up @@ -63,14 +63,14 @@ def __init__(
trie_builder(self.vocab_rev),
)

self.sam_rev = GeneralSAM.from_trie(self.trie_rev)
self.sam_rev = GeneralSam.from_trie(self.trie_rev)
self._gen_cnt_info_in_sam()

@property
def _state_feed_fn(self) -> Callable[[GeneralSAMState, Union[bytes, str]], None]:
def _state_feed_fn(self) -> Callable[[GeneralSamState, Union[bytes, str]], None]:
return {
VocabPrefixBytesOrChars.BYTES: GeneralSAMState.feed_bytes,
VocabPrefixBytesOrChars.CHARS: GeneralSAMState.feed_chars,
VocabPrefixBytesOrChars.BYTES: GeneralSamState.feed_bytes,
VocabPrefixBytesOrChars.CHARS: GeneralSamState.feed_chars,
}[self.bytes_or_chars]

def _gen_cnt_info_in_sam(self):
Expand Down Expand Up @@ -127,11 +127,11 @@ def _gen_cnt_info_in_sam(self):
assert link_cnt_info.tot_cnt_lower <= state_cnt_info.tot_cnt_lower
assert link_cnt_info.tot_cnt_upper >= state_cnt_info.tot_cnt_upper

def get_root_state(self) -> GeneralSAMState:
def get_root_state(self) -> GeneralSamState:
return self.sam_rev.get_root_state()

def prepend_feed(
self, state: GeneralSAMState, token: Union[str, bytes]
self, state: GeneralSamState, token: Union[str, bytes]
) -> Optional[CountInfo]:
if self.bytes_or_chars == VocabPrefixBytesOrChars.BYTES and isinstance(
token, str
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ use pyo3::prelude::*;
fn general_sam(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<trie::TrieNode>()?;
m.add_class::<trie::Trie>()?;
m.add_class::<sam::GeneralSAMState>()?;
m.add_class::<sam::GeneralSAM>()?;
m.add_class::<sam::GeneralSamState>()?;
m.add_class::<sam::GeneralSam>()?;
m.add_class::<tokenizer::GreedyTokenizer>()?;
Ok(())
}
Loading

0 comments on commit 360e0c3

Please sign in to comment.