ml-cb
This model is a fine-tuned version of dbernsohn/roberta-javascript on the ml-cb dataset. This model is trained on plaintext (as opposed to text processed with jsNice or a JS obfuscator). More information about the project may be found at OSF.
This model is the transformer version of ml-cb. It achieves the following results on the evaluation set:
- Train Loss: 0.0357
- Validation Loss: 0.0525
- Train Accuracy: 0.9849
- Epoch: 4
---EXAMPLE input #1: Canonical Fingerprinting (should be true)---
/**
 * Canonical canvas-fingerprinting example.
 *
 * Creates a canvas element (never attached to the page in this snippet),
 * draws a fixed string and two rectangles with specific styling, then
 * serializes the rendered pixels with toDataURL(). The serialized value is
 * stored in a local variable only; the function returns nothing.
 */
function fingerprint() {
  var canvas = document.createElement('canvas');
  var ctx = canvas.getContext('2d');
  var txt = 'i9asdm..$#po((^@KbXrww!~cz';
  // This first baseline is overwritten two statements below, before anything
  // is drawn; it is kept because it is part of the canonical snippet.
  ctx.textBaseline = 'top';
  // The font family is quoted inside the CSS font shorthand string.
  ctx.font = "16px 'Arial'";
  ctx.textBaseline = 'alphabetic';
  ctx.rotate(.05);
  ctx.fillStyle = '#f60';
  ctx.fillRect(125, 1, 62, 20);
  ctx.fillStyle = '#069';
  ctx.fillText(txt, 2, 15);
  // Second pass draws the same text offset by (2, 2) in a translucent color.
  ctx.fillStyle = 'rgba(102, 200, 0, 0.7)';
  ctx.fillText(txt, 4, 17);
  ctx.shadowBlur = 10;
  ctx.shadowColor = 'blue';
  ctx.fillRect(-20, 10, 234, 5);
  // Reading the rendered pixels back is the step that yields the fingerprint.
  var strng = canvas.toDataURL();
}
---EXAMPLE input #2: WPemoji False Positive (should be false)---
// Settings object consumed by the loader below: an image base URL, an image
// file extension, and the URL of the emoji script under source.concatemoji.
window._wpemojiSettings = {
'baseUrl': 'http:\/\/s.w.org\/images\/core\/emoji\/72x72\/',
'ext': '.png',
'source': {
'concatemoji': 'http:\/\/basho.com\/wp-includes\/js\/wp-emoji-release.min.js?ver=4.2.2'
}
};
// Minified loader, invoked immediately with a = window, b = document and
// c = window._wpemojiSettings. It uses a canvas to test whether the browser
// can render emoji and loads the fallback script(s) only when it cannot.
! function(a, b, c) {
// d(a): emoji rendering test; always evaluates to a boolean.
// - Returns false when no 2D context or no fillText is available.
// - For a === 'flag': draws the string built from UTF-16 code units
//   55356, 56812, 55356, 56807 (U+1F1EC U+1F1E7) and reports whether the
//   canvas data URL is longer than 3000 characters.
// - For any other argument: draws 55357, 56835 (U+1F603) and reports whether
//   the first channel of the pixel at (16, 16) is non-zero.
// Note that the inner `c` and `d` shadow the outer settings object and this
// function's own name.
function d(a) {
var c = b.createElement('canvas'),
d = c.getContext && c.getContext('2d');
return d && d.fillText ? (d.textBaseline = 'top', d.font = '600 32px Arial', 'flag' === a ? (d.fillText(String.fromCharCode(55356, 56812, 55356, 56807), 0, 0), c.toDataURL().length > 3e3) : (d.fillText(String.fromCharCode(55357, 56835), 0, 0), 0 !== d.getImageData(16, 16, 1, 1).data[0])) : !1
}
// e(a): appends a <script type="text/javascript" src=a> element to the first
// <head> element of the document.
function e(a) {
var c = b.createElement('script');
c.src = a, c.type = 'text/javascript', b.getElementsByTagName('head')[0].appendChild(c)
}
var f, g;
// One comma-chained expression; evaluation order matters:
// 1. Store both test results on c.supports.
// 2. Initialise c.DOMReady to false and define c.readyCallback, which sets it
//    to true.
// 3. Only if either test failed (short-circuit ||): register g for
//    DOMContentLoaded and load, or via attachEvent (onload and
//    onreadystatechange) when addEventListener is missing. Then load
//    source.concatemoji if present, otherwise source.twemoji followed by
//    source.wpemoji when both are present.
c.supports = {
simple: d('simple'),
flag: d('flag')
}, c.DOMReady = !1, c.readyCallback = function() {
c.DOMReady = !0
}, c.supports.simple && c.supports.flag || (g = function() {
c.readyCallback()
}, b.addEventListener ? (b.addEventListener('DOMContentLoaded', g, !1), a.addEventListener('load', g, !1)) : (a.attachEvent('onload', g), b.attachEvent('onreadystatechange', function() {
'complete' === b.readyState && c.readyCallback()
})), f = c.source || {}, f.concatemoji ? e(f.concatemoji) : f.wpemoji && f.twemoji && (e(f.twemoji), e(f.wpemoji)))
}(window, document, window._wpemojiSettings);
Intended uses
The model is intended to detect canvas fingerprinting in a JavaScript program. Canvas fingerprinting is a stateless form of web tracking that allows companies to track you.
Limitations
- Hosted Inference (and tokenizer and model) truncates text past 512 tokens!
Training and evaluation data
The training data was cleaned from the original dataset.
Training procedure
Training hyperparameters
The following hyperparameters were used during training:
- optimizer: {'name': 'Adam', 'weight_decay': None, 'clipnorm': None, 'global_clipnorm': None, 'clipvalue': None, 'use_ema': False, 'ema_momentum': 0.99, 'ema_overwrite_frequency': None, 'jit_compile': False, 'is_legacy_optimizer': False, 'learning_rate': {'class_name': 'PolynomialDecay', 'config': {'initial_learning_rate': 2e-05, 'decay_steps': 1300, 'end_learning_rate': 0.0, 'power': 1.0, 'cycle': False, 'name': None}}, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-08, 'amsgrad': False}
- training_precision: float32
Training results
Train Loss | Validation Loss | Train Accuracy | Epoch |
---|---|---|---|
0.1217 | 0.0753 | 0.9741 | 0 |
0.0799 | 0.0651 | 0.9741 | 1 |
0.0639 | 0.0471 | 0.9870 | 2 |
0.0459 | 0.0539 | 0.9806 | 3 |
0.0357 | 0.0525 | 0.9849 | 4 |
Framework versions
- Transformers 4.30.2
- TensorFlow 2.11.0
- Datasets 2.13.2
- Tokenizers 0.13.3