ml-cb

This model is a fine-tuned version of dbernsohn/roberta-javascript on the ml-cb dataset. It is trained on plaintext JavaScript (as opposed to code that has been processed with jsNice or a JS obfuscator). More information about the project may be found at OSF.

This model is the transformer version of ml-cb. It achieves the following results on the evaluation set:

Train Loss: 0.0357
Validation Loss: 0.0525
Train Accuracy: 0.9849
Epoch: 4

---EXAMPLE input #1: Canonical Fingerprinting (should be true)---
/**
 * Canonical canvas-fingerprinting routine.
 *
 * Creates a canvas element, draws a fixed text string and two rectangles on
 * its 2D context using several fill styles, a small rotation and a shadow,
 * then serializes the canvas with `toDataURL()`.
 *
 * @returns {string} The value produced by `canvas.toDataURL()`.
 */
function fingerprint() {
    var canvas = document.createElement('canvas');
    var ctx = canvas.getContext('2d');
    var txt = 'i9asdm..$#po((^@KbXrww!~cz';
    ctx.textBaseline = 'top';
    // The font family is quoted inside the CSS font string, so the outer
    // literal uses double quotes to keep the statement syntactically valid.
    ctx.font = "16px 'Arial'";
    ctx.textBaseline = 'alphabetic';
    ctx.rotate(.05);
    ctx.fillStyle = '#f60';
    ctx.fillRect(125, 1, 62, 20);
    ctx.fillStyle = '#069';
    ctx.fillText(txt, 2, 15);
    // Second pass draws the same text offset by 2px in a translucent colour.
    ctx.fillStyle = 'rgba(102, 200, 0, 0.7)';
    ctx.fillText(txt, 4, 17);
    ctx.shadowBlur = 10;
    ctx.shadowColor = 'blue';
    ctx.fillRect(-20, 10, 234, 5);
    // Serialize the rendered canvas and hand the result back to the caller.
    var strng = canvas.toDataURL();
    return strng;
}
---EXAMPLE input #2: WPemoji False Positive (should be false)---
// Settings object consumed by the loader below, which receives it as its
// third argument (`c`). `source.concatemoji` is the script URL the loader
// injects when its canvas feature checks do not both pass.
window._wpemojiSettings = {
    'baseUrl': 'http:\/\/s.w.org\/images\/core\/emoji\/72x72\/',
    'ext': '.png',
    'source': {
        'concatemoji': 'http:\/\/basho.com\/wp-includes\/js\/wp-emoji-release.min.js?ver=4.2.2'
    }
};
// Minified loader, invoked immediately with a = window, b = document and
// c = window._wpemojiSettings (see the call at the bottom).
! function(a, b, c) {
    // d(a): canvas-based feature check.
    // Creates a canvas and takes its 2D context when `getContext` exists.
    // If there is no context or no `fillText`, the result is false (`!1`).
    // Otherwise it sets the baseline and font, then:
    //   - for a === 'flag': draws the string built from two surrogate pairs
    //     and returns whether the canvas data URL is longer than 3000 chars;
    //   - for any other value: draws a single surrogate-pair character and
    //     returns whether the first channel of the pixel at (16, 16) is non-zero.
    function d(a) {
        var c = b.createElement('canvas'),
            d = c.getContext && c.getContext('2d');
        return d && d.fillText ? (d.textBaseline = 'top', d.font = '600 32px Arial', 'flag' === a ? (d.fillText(String.fromCharCode(55356, 56812, 55356, 56807), 0, 0), c.toDataURL().length > 3e3) : (d.fillText(String.fromCharCode(55357, 56835), 0, 0), 0 !== d.getImageData(16, 16, 1, 1).data[0])) : !1
    }

    // e(a): creates a <script> element with src = a and type
    // 'text/javascript', and appends it to the first <head> element.
    function e(a) {
        var c = b.createElement('script');
        c.src = a, c.type = 'text/javascript', b.getElementsByTagName('head')[0].appendChild(c)
    }
    var f, g;
    // The remainder is one comma-chained expression:
    //   1. store both feature-check results on c.supports;
    //   2. initialise c.DOMReady to false and define c.readyCallback, which
    //      sets it to true;
    //   3. only when `simple` and `flag` are NOT both supported (right-hand
    //      side of `||`): define g to call c.readyCallback, register it for
    //      DOMContentLoaded/load via addEventListener (or via attachEvent
    //      with an onreadystatechange 'complete' check when addEventListener
    //      is missing), then read f = c.source || {} and inject
    //      f.concatemoji if set, else f.twemoji followed by f.wpemoji when
    //      both are set.
    c.supports = {
        simple: d('simple'),
        flag: d('flag')
    }, c.DOMReady = !1, c.readyCallback = function() {
        c.DOMReady = !0
    }, c.supports.simple && c.supports.flag || (g = function() {
        c.readyCallback()
    }, b.addEventListener ? (b.addEventListener('DOMContentLoaded', g, !1), a.addEventListener('load', g, !1)) : (a.attachEvent('onload', g), b.attachEvent('onreadystatechange', function() {
        'complete' === b.readyState && c.readyCallback()
    })), f = c.source || {}, f.concatemoji ? e(f.concatemoji) : f.wpemoji && f.twemoji && (e(f.twemoji), e(f.wpemoji)))
}(window, document, window._wpemojiSettings);

Intended uses

The model is intended to detect canvas fingerprinting in a JavaScript program. Canvas fingerprinting is a stateless form of web tracking that allows companies to track you.

Limitations

Hosted Inference (as well as the tokenizer and the model) truncates text past 512 tokens, so any part of a script beyond that limit is not seen by the model!

Training and evaluation data

The training data was cleaned from the original dataset.

Training procedure

Training hyperparameters

The following hyperparameters were used during training:

optimizer: {'name': 'Adam', 'weight_decay': None, 'clipnorm': None, 'global_clipnorm': None, 'clipvalue': None, 'use_ema': False, 'ema_momentum': 0.99, 'ema_overwrite_frequency': None, 'jit_compile': False, 'is_legacy_optimizer': False, 'learning_rate': {'class_name': 'PolynomialDecay', 'config': {'initial_learning_rate': 2e-05, 'decay_steps': 1300, 'end_learning_rate': 0.0, 'power': 1.0, 'cycle': False, 'name': None}}, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-08, 'amsgrad': False}
training_precision: float32

Training results

Train Loss	Validation Loss	Train Accuracy	Epoch
0.1217	0.0753	0.9741	0
0.0799	0.0651	0.9741	1
0.0639	0.0471	0.9870	2
0.0459	0.0539	0.9806	3
0.0357	0.0525	0.9849	4

Framework versions

Transformers 4.30.2
TensorFlow 2.11.0
Datasets 2.13.2
Tokenizers 0.13.3